library(tidyverse)
library(ape)
library(phangorn)
library(pegas)
#library(strataG)
library(knitr)
library(emoji)
library(coda)
library(ggmcmc)
library(perm)
library(Rmpfr)
library(spatstat.explore)
library(colorspace)
library(latex2exp)
library(ggridges)
# Function library
harvest.model.likelihoods <- function(workingDir,
                                      outfileName = "outfile.txt",
                                      multilocus = TRUE){
  # Harvest model marginal likelihoods for models calculated by the program
  # migrate-n (Beerli & Felsenstein 2001).
  # It takes as input a directory full of directories, each of which contains
  # output from a migrate model, and is named after that model.
  #
  # workingDir  - path containing one subdirectory per model
  # outfileName - name of the migrate-n outfile within each model directory
  # multilocus  - kept for backward compatibility; not used by this function
  #
  # Returns a data frame with columns model, thermodynamic, bezier.corrected,
  # and harmonic (numeric log marginal likelihoods), one row per model.
  #
  # Note: the original default `workingDir = workingDir` was self-referential
  # and errored whenever the argument was omitted, so workingDir is now
  # simply required.
  model_names <- list.dirs(workingDir, full.names = FALSE)[-1]
  rows <- vector("list", length(model_names))  # preallocate; no rbind-in-loop
  for (k in seq_along(model_names)) {
    modelDir <- file.path(workingDir, model_names[k])
    print(modelDir)
    # scan in the outfile, separating at each newline
    outfile <- scan(file = file.path(modelDir, outfileName),
                    what = "character", sep = "\n", quiet = TRUE)
    # the line immediately above "Scaling factor" holds the three
    # likelihoods in whitespace-delimited fields 3:5
    likeline <- grep("Scaling factor", outfile)[1] - 1
    if (is.na(likeline) || likeline < 1) next  # outfile lacks the table; skip
    vals <- as.numeric(strsplit(outfile[likeline], "\\s+", perl = TRUE)[[1]][3:5])
    rows[[k]] <- data.frame(model = model_names[k],
                            thermodynamic = vals[1],
                            bezier.corrected = vals[2],
                            harmonic = vals[3],
                            stringsAsFactors = FALSE)
  }
  modelMarglikes <- do.call(rbind, rows)
  if (is.null(modelMarglikes)) {
    # no model directories (or none parseable): return an empty, named frame
    modelMarglikes <- data.frame(model = character(),
                                 thermodynamic = numeric(),
                                 bezier.corrected = numeric(),
                                 harmonic = numeric())
  }
  modelMarglikes
}
bfcalcs <- function(df, ml = "bezier.corrected") {
  # Compute log Bayes factors, model ranks, and model probabilities on a
  # data frame produced by harvest.model.likelihoods(), following
  # Johnson and Omland (2004).
  # Choose the likelihood flavor with
  # ml = "bezier.corrected", "thermodynamic" or "harmonic".
  likes <- df[[ml]]
  # best model's likelihood is the reference point (its LBF is 0)
  best <- likes[which.max(likes)]
  lbf <- 2 * (likes - best)
  # rank 1 = best model
  choice <- rank(-likes)
  # normalized model probabilities
  raw_probs <- exp(lbf / 2)
  modelprob <- raw_probs / sum(raw_probs)
  cbind(df, lbf, choice, modelprob)
}
migrants.per.gen <- function(x) {
  # Create Nm (migrants-per-generation) columns out of the M and Theta
  # columns of a migrate-n posterior table.
  # For every column named M_<from>_<to>, adds a column Nm_<from>_<to>
  # equal to M_<from>_<to> * Theta_<to>. Returns x with the new columns.
  m_cols <- grep("M_", names(x), value = TRUE)  # names of the m columns
  for (m_col in m_cols) {
    # split once per column (the original re-split the name three times,
    # and used `t` as a loop variable, shadowing base::t)
    parts <- strsplit(m_col, split = "_")[[1]]   # c("M", from, to)
    theta_col <- paste("Theta", parts[3], sep = "_")
    nm_col <- paste("Nm", parts[2], parts[3], sep = "_")
    # Nm_X_Y is the M_X_Y column times the Theta_Y column
    x[[nm_col]] <- x[[m_col]] * x[[theta_col]]
  }
  x
}
Rob Toonen asked me to take a look at this dataset sequenced and analyzed by ’Ale’alani Dudoit.
From Rob:
It is a zoanthid - Palythoa tuberculosa - that is most common in anthropogenically disturbed habitats. It seems like it could be a cool story, and she has plenty of SNPs or contigs to do whatever analysis you think would be most interesting to work up.
’Ale’a:
Mahalo for checking out our data Eric, would be great to try out Migrate-n and see what results come of it. A couple other folks in the lab had similar interesting fst/structure results with their data (corals and fish) where O’ahu is more genetically similar to Kure/Midway atoll than to neighboring islands. Would be interesting to see if we get any cool results as Rob mentioned with military transport b/w O’ahu and Midway during WWII. I can send along my TotalRawSNPs and Filtered vcf files through scp command if you have a destination path you can provide me. Please let me know what other info you need from me, happy to send along.
Eric:
So it turns out that dDocent (or actually an associated perl script) can make haplotypes, but it will need the bam files for each individual. Supposedly we have unlimited storage on Google Drive at PSU, so I’m going to test that. I’ve shared a folder that you can use to upload the bam files.
So, now I’ve received all .bam and .bai files for each individual, as well as locality metadata and total and filtered SNP datasets from ’Ale’a on Google Drive. I compressed these on my mac, and now need to transfer them to Argonaute. I will do this with the gdown python package (pip install gdown). This involves looking up the google id for each file in the URL of the share link and pasting that into gdown thusly
gdown https://drive.google.com/uc?id=19nagmzHPQgKE4PPlyHikcWc7lBZFMbXB --output ptuberculosa_metadata.csv
gdown https://drive.google.com/uc?id=19g9fMRdPH-_AG-l6kytu-lv6kd3qYpQD --output popmap
gdown https://drive.google.com/uc?id=1ASB4yOnwDzDhFPA7Pev4CyIl6M6SX8_P --output popmap.csv
gdown https://drive.google.com/uc?id=19ZsobdvBIs3bQukW0Q-ivpdlJEdGguns --output reference.fasta
gdown https://drive.google.com/uc?id=19XNrJvZKMQ3L7qTIuExajc5KY6-gj3P8 --output mm0.9minq30mac3thin500.recode.vcf
gdown https://drive.google.com/uc?id=19vth-3IMkfMfxq4cp0QLOUgC2yFqnZ3C --output TotalRawSNPs.vcf
gdown https://drive.google.com/uc?id=1I0ib-QX4eyedctD951os6gvAMUe761JU --output bam_files.zip
gdown https://drive.google.com/uc?id=1ES3yMzZS8JkWROiE1jdfTp6KBjxDdCXO --folder --remaining-ok --output firstbatch
‘Ale’ already filtered the SNPs quite a bit, but I need to start that process over because coalescent programs are sensitive to ascertainment bias. Also, need to knit them back into haplotypes. As Peter Beerli says on page 15 of the migrate-n manual:
We use a rather restrictive ascertainment models for SNPs ?. A better approach than using SNPs is the use of short reads which may or many not contain SNPs. I find that SNPs are an inferior datatype because commonly researchers are adding criteria such as a minor SNP allele must occur at a frequency higher than x, and singletons are excluded etc.
We have found ALL variable sites and use them even if there are only a few members of another alleles present. In principal it is as you would sequence a stretch of DNA and then remove the invariant sites. Each stretch is treated as completely linked. You can combine many of such “loci” to improve your estimates.
I’ve installed dDocent into a dedicated environment following instructions here.
conda activate ddocent_env
Following the tutorial, but altering filtering parameters. Keeping all SNPs with even a single occurrence (Minor Allele Count = 1; --mac 1), requiring sites to be called in at least 10% of individuals (--max-missing 0.1; stricter per-population missingness filtering comes later), and requiring high quality (--minQ 30). We are keeping genotypes with as few as 3 reads (--minDP 3). This is because evidence to keep these SNPs is based on their existence in multiple individuals.
vcftools --vcf ../TotalRawSNPs.vcf --max-missing 0.1 --mac 1 --minQ 30 --minDP 3 --recode --recode-INFO-all --out rawg.1mac1dp3
After filtering, kept 93 out of 93 Individuals Outputting VCF file… After filtering, kept 28128 out of a possible 52804 Sites Run Time = 9.00 seconds
Run this error checking script to see the potential impact of shallow-sequenced genotypes
ErrorCount.sh rawg.1mac1dp3.recode.vcf
<!-- This script counts the number of potential genotyping errors due to low read depth -->
<!-- It report a low range, based on a 50% binomial probability of observing the second allele in a heterozygote and a high range based on a 25% probability. -->
<!-- Potential genotyping errors from genotypes from only 1 read range from 0.0 to 0.0 -->
<!-- Potential genotyping errors from genotypes from only 2 reads range from 0.0 to 0.0 -->
<!-- Potential genotyping errors from genotypes from only 3 reads range from 3808.875 to 12797.82 -->
<!-- Potential genotyping errors from genotypes from only 4 reads range from 3860.25 to 19517.424 -->
<!-- Potential genotyping errors from genotypes from only 5 reads range from 913.78125 to 6930 -->
<!-- 93 number of individuals and 28128 equals 2615904 total genotypes -->
<!-- Total genotypes not counting missing data 2508832 -->
<!-- Total potential error rate is between 0.0034210765208670807 and 0.015642834593946504 -->
<!-- SCORCHED EARTH SCENARIO -->
<!-- WHAT IF ALL LOW DEPTH HOMOZYGOTE GENOTYPES ARE ERRORS????? -->
<!-- The total SCORCHED EARTH error rate is 0.04841934414101861. -->
Look at missing data per individual
vcftools --vcf rawg.1mac1dp3.recode.vcf --missing-indv
mawk '!/IN/' out.imiss | cut -f5 > totalmissing
gnuplot << \EOF
set terminal dumb size 120, 30
set autoscale
unset label
set title "Histogram of % missing data per individual"
set ylabel "Number of Occurrences"
set xlabel "% of missing data"
#set yr [0:100000]
binwidth=0.01
bin(x,width)=width*floor(x/width) + binwidth/2.0
plot 'totalmissing' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
Histogram of % missing data per individual
20 +-----------------------------------------------------------------------------------------------------------+
| * * + + + + + + + |
| * ** 'totalmissing' using (bin($1,binwidth)):(1.0) ******* |
| * ** |
| * ** |
| * ** |
15 |-+ * ** +-|
| * ** |
| ** ** |
| ** ** |
| ** ** |
10 |-+ ** ** +-|
| ** ** |
| ** *** |
| ** *** |
| ** *** |
| ** ***** |
5 |-+ ** *** * +-|
| *** *** **** |
| *** *** ** * |
| *** *** ** * |
| *** *** ** ******* |
|***** *** ** *** * ******************************************************************************* |
0 +-----------------------------------------------------------------------------------------------------------+
0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8
% of missing data
‘Ale’ has probably removed any really bad individuals already. I’ll keep all of these for now.
Now filter based on missingness within populations. Puritz wrote a script for this that creates lots of output, so I created ./filterpop directory to run it in. I am removing loci that are missing 10% of data in any of 10 populations. I altered popmap to remove spaces from pop names too.
pop_missing_filter.sh ../rawg.1mac1dp3.recode.vcf ../../popmap 0.1 10 rawg1mac1dp3allpops.1.vcf
This filter keeps loci with Allele Balance between 0.25 and 0.75. From Jon:
Allele balance is: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous Because RADseq targets specific locations of the genome, we expect that the allele balance in our data (for real loci) should be close to 0.5
So this will keep only really high quality SNPs
vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01" ./filterpops/rawg1mac1dp3allpops.1.vcf.recode.vcf > rawg.1mac1dp3allpop.1.ABfil.vcf
mawk '!/#/' *ABfil.vcf | wc -l
2756
Now remove SNPs that occur on both strands of a read. Jon:
Unless you are using super small genomic fragment or really long reads (MiSeq). A SNP should be covered only by forward or only reverse reads. The filter is based on proportions, so that a few extraneous reads won’t remove an entire locus…. In plain english, it’s keeping loci that have over 100 times more forward alternate reads than reverse alternate reads and 100 times more forward reference reads than reverse reference reads along with the reciprocal….That only removes a small proportion of loci, but these loci are likely to be paralogs, microbe contamination, or weird PCR chimeras.
```{bash, eval = F} vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100" -s rawg.1mac1dp3allpop.1.ABfil.vcf > rawg.1mac1dp3allpop.1.ABfil.FRfil.vcf
mawk '!/#/' rawg.1mac1dp3allpop.1.ABfil.FRfil.vcf | wc -l
7
Hmmm... that's a bit too stringent... maybe these were done on a MiSeq?
> The next filter looks at the ratio of mapping qualities between reference and alternate alleles...The rationale here is that, again, because RADseq loci and alleles all should start from the same genomic location there should not be large discrepancy between the mapping qualities of two alleles.
```bash
vcffilter -f "MQM / MQMR > 0.9 & MQM / MQMR < 1.05" rawg.1mac1dp3allpop.1.ABfil.vcf > rawg.1mac1dp3allpop.1.ABfil.MQfil.vcf
mawk '!/#/' rawg.1mac1dp3allpop.1.ABfil.MQfil.vcf | wc -l
2512
Yet another filter that can be applied is whether or not there is a discrepancy in the properly paired status for reads supporting reference or alternate alleles…. Since de novo assembly is not perfect, some loci will only have unpaired reads mapping to them. This is not a problem. The problem occurs when all the reads supporting the reference allele are paired but the reads supporting the alternate allele are not. That is indicative of a problem.
vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s rawg.1mac1dp3allpop.1.ABfil.MQfil.vcf > rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.vcf
mawk '!/#/' rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.vcf| wc -l
2141
dDocent_filters rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.vcf rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.dDFil.vcf
This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth,
quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation.
The script assumes that loci and individuals with low call rates (or depth) have already been removed.
Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters
Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth
1192 of 2141
Are reads expected to overlap? In other words, is fragment size less than 2X the read length? Enter yes or no.
yes
Is this from a mixture of SE and PE libraries? Enter yes or no.
no
Number of additional sites filtered based on properly paired status
0 of 949
Number of sites filtered based on high depth and lower than 2*DEPTH quality score
78 of 949
Histogram of mean depth per site
30 +---------------------------------------------------------------------------------------------------------+
| + + **+ + + + + + + + + + + + + + + + + +|
| ** 'meandepthpersite' using (bin($1,binwidth)):(1.0) ******* |
| *** ** |
25 |-+ * * ** +-|
| ** **** ** |
| ** **** ** |
| ** **** ** |
20 |-+ ** ** **** ** +-|
| ** ** ** **** ** |
| ** ** ** ***** ** ** |
15 |*** ** ** ***** ** ** +-|
|**** ** ** ***** ** ** |
|**** ** ** ******* ** ** |
|**** ***** ******* ** ** ** |
10 |********** ************* *** ** +-|
|********** ************** **** ** ** |
|********** ************** **** ** ** |
|********** ********************** *** *** ***** |
5 |********** ******************** * ************ * *** +-|
|********** ******************** ************** ** * ******* ** |
|********** ******************** ******************** ****************** ************ *** *** *** *|
|********** ******************** ******************** ************ *** *** ****** * *** *** **** *********|
0 +---------------------------------------------------------------------------------------------------------+
10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110
Mean Depth
If distrubtion looks normal, a 1.645 sigma cutoff (~90% of the data) would be 10141.7555
The 95% cutoff would be 101
Would you like to use a different maximum mean depth cutoff than 101, yes or no
no
Number of sites filtered based on maximum mean depth
81 of 949
Number of sites filtered based on within locus depth mismatch
36 of 868
Total number of sites filtered
1309 of 2141
Remaining sites
832
Filtered VCF file is called rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.dDFil.vcf.FIL.recode.vcf
Filter stats stored in rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.dDFil.vcf.filterstats
(ddocent_env) ecrandall@Argonaute:~/eric_data/ptuberculosa/filtering$
This script apparently does many of the other steps that I did above manually, so I started from an earlier checkpoint
dDocent_filters rawg1mac1dp3allpops.1.vcf.recode.vcf rawg.1mac1dp3allpop.1.dDfil.vcf
This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth,
quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation.
The script assumes that loci and individuals with low call rates (or depth) have already been removed.
Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters
Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth
23877 of 25261
Are reads expected to overlap? In other words, is fragment size less than 2X the read length? Enter yes or no.
yes
Is this from a mixture of SE and PE libraries? Enter yes or no.
no
Number of additional sites filtered based on properly paired status
197 of 1384
Number of sites filtered based on high depth and lower than 2*DEPTH quality score
1673 of 1187
Histogram of mean depth per site
60 +---------------------------------------------------------------------------------------------------------+
| + + + + + + + + + + + + + + + + + + + +|
| 'meandepthpersite' using (bin($1,binwidth)):(1.0) ******* |
| ** |
50 |-+ ** +-|
| ** |
| ** ** |
| ** ** ** |
40 |-+ ** ** ** +-|
| ** ** ** ** |
| ** *** *** ** ** |
30 |-+ *********** ***** +-|
| ** *********** ***** |
| ** ***************** |
| ******************** |
20 |-******************** ** +-|
|********************** *** |
|********************** *** |
|**************************** * |
10 |**************************** *** * +-|
|******************************** ** ** ***** ** |
|**************************************************** ** ** *** |
|************************************************************************** **** ** ** + *** ** +*****|
0 +---------------------------------------------------------------------------------------------------------+
12 18 24 30 36 42 48 54 60 66 72 78 84 90 96 102 108 114 120 126
Mean Depth
If distrubtion looks normal, a 1.645 sigma cutoff (~90% of the data) would be 11589.30825
The 95% cutoff would be 116
Would you like to use a different maximum mean depth cutoff than 116, yes or no
no
Number of sites filtered based on maximum mean depth
109 of 1187
Number of sites filtered based on within locus depth mismatch
21 of 1072
Total number of sites filtered
24210 of 25261
Remaining sites
1051
Filtered VCF file is called rawg.1mac1dp3allpop.1.dDfil.vcf.FIL.recode.vcf
Filter stats stored in rawg.1mac1dp3allpop.1.dDfil.vcf.filterstats
mv rawg.1mac1dp3allpop.1.dDfil.vcf.FIL.recode.vcf rawg.1mac1dp3allpop.1.dDfil.vcf
Now to filter by HWE
The next filter to apply is HWE. Heng Li also found that HWE is another excellent filter to remove erroneous variant calls. We don’t want to apply it across the board, since population structure will create departures from HWE as well. We need to apply this by population. I’ve included a perl script written by Chris Hollenbeck, one of the PhD student’s in my current lab that will do this for us.
Let’s filter our SNPs by population specific HWE First, we need to convert our variant calls to SNPs To do this we will use another command from vcflib called vcfallelicprimatives
vcfallelicprimitives rawg.1mac1dp3allpop.1.dDfil.vcf --keep-info --keep-geno > rawg.1mac1dp3allpop.1.dDfil.prim.vcf
This will decompose complex variant calls into phased SNP and INDEL genotypes and keep the INFO flags for loci and genotypes. Next, we can feed this VCF file into VCFtools to remove indels.
vcftools --vcf rawg.1mac1dp3allpop.1.dDfil.prim.vcf --remove-indels --recode --recode-INFO-all --out SNP.rawg.1mac1dp3allpop.1.dDfil
Outputting VCF file… After filtering, kept 1096 out of a possible 1126 Sites Run Time = 0.00 seconds
Now apply the HWE filter, in filterhwe folder
filter_hwe_by_pop.pl -v ../SNP.rawg.1mac1dp3allpop.1.dDfil.recode.vcf -p ../../popmap -o SNP.rawg.1mac1dp3allpop.1.dDfil.HWE -h 0.01 -c 0.1
<!-- Processing population: BigIsland (17 inds) -->
<!-- Processing population: FrenchFrigateShoals (9 inds) -->
<!-- Processing population: Kauai (15 inds) -->
<!-- Processing population: Kure (10 inds) -->
<!-- Processing population: MaroReef (5 inds) -->
<!-- Processing population: Maui (10 inds) -->
<!-- Processing population: Molokai (10 inds) -->
<!-- Processing population: Oahu (10 inds) -->
<!-- Processing population: Pearl_Hermes (4 inds) -->
<!-- Processing population: PioneerBanks (3 inds) -->
<!-- Outputting results of HWE test for filtered loci to 'filtered.hwe' -->
<!-- Kept 997 of a possible 1096 loci (filtered 99 loci) -->
Hmm… the HWE filter is not working great, likely because of low sample sizes. It’s leaving me with mostly loci without individuals with alternate homozygotes
conda deactivate
Chris Hollenbeck has created a script that can get haplotypes from dDocent output. First, gotta remove the -RG from the .bam and .bai files.
# Strip the "-RG" read-group suffix that dDocent leaves on the BAM/BAI
# filenames, so downstream tools can match files to sample names.
for f in *.bam; do mv "$f" "${f%-RG.bam}.bam" ; done
for f in *.bam.bai; do mv "$f" "${f%-RG.bam.bai}.bam.bai" ; done
Now, to install the rad_haplotyper environment following these instructions
hmmm … rad_haplotyper isn’t working because the Vcf Perl module changed its name to VCF… breaking the script. I tried a few minor modifications but couldn’t fix it. Created an issue in the github site.
Remember to remove the environment: $ conda remove rad_haplotyper_env
I am now going to try microhaplot by Thomas Ng and Eric Anderson!!
Now, have to convert the bam files to sam files.
# Convert each BAM to a headered SAM (microhaplot reads SAM input).
for bam in *.bam
do
echo $bam
samtools view -h -o ../sam_files/${bam%.bam}.sam $bam
done
# Strip the "-RG" suffix from BAM filenames (no-op for files already renamed).
for bam in *.bam
do
echo $bam
mv $bam ${bam%-RG.bam}.bam
done
# Same rename for the index files.
for bam in *.bai
do
echo $bam
mv $bam ${bam%-RG.bam.bai}.bam.bai
done
# Per-sample alignment summaries (reads mapped per reference contig).
for bam in *.bam
do
echo $bam
samtools idxstats $bam > ./idxstats/${bam%.bam}_idxstats.txt
done
# Repeat of the SAM conversion, writing to the local (laptop) sam_files path.
for bam in *.bam
do
echo $bam
samtools view -h -o /Users/edc5240/Datasets/Ptuberculosa/sam_files/${bam%.bam}.sam $bam
done
Do the tutorial
# microhaplot tutorial: copy the bundled Shiny app into shiny_dir, then launch it.
library(microhaplot)
shiny_dir <- "/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/Ptuberculosa_data/shiny_dir"
# installs the app skeleton under shiny_dir/microhaplot
microhaplot::mvShinyHaplot(shiny_dir)
app.path <- file.path(shiny_dir, "microhaplot")
microhaplot::runShinyHaplot(app.path)
Then follow along here to get data into microhaplot.
# Stage the P. tuberculosa data for microhaplot: parse the per-sample SAM files
# against the filtered VCF into haplotype read tables, then launch the app.
library(microhaplot)
run.label <- "Ptuberculosa"
path <- "/Users/edc5240/Datasets/Ptuberculosa"
sam.path <- "/Users/edc5240/Datasets/Ptuberculosa/sam_files"
# untar(system.file("extdata",
# "sebastes_sam.tar.gz",
# package="microhaplot"),
# exdir = sam.path)
# labels.txt maps SAM filenames to sample IDs and groups
label.path <- file.path(path, "labels.txt")
vcf.path <- file.path(path, "mm0.9minq30mac3thin500.recode.vcf")
# install the Shiny app skeleton under path/shiny_dir
mvShinyHaplot(file.path(path,"shiny_dir"))
app.path <- file.path(path,"shiny_dir", "microhaplot")
# extract haplotypes from the SAMs at the VCF's variant positions
haplo.read.tbl <- prepHaplotFiles(run.label = run.label,
sam.path = sam.path,
out.path = path,
label.path = label.path,
vcf.path = vcf.path,
app.path = app.path)
runShinyHaplot(path = "/Users/edc5240/Datasets/Ptuberculosa/shiny_dir/microhaplot")
Working with the data in microhaplot was OK, but I kept getting unexplained N’s in the output, and it was going to take some coding effort to translate the output into migrate format. Moreover, microhaplot only kept the variable sites, which isn’t ideal for migrate’s mutation model. So I am giving up on this front, and am going to try to generate haplotypes with Stacks.
Need first to copy all the fastq files over from Google Drive, where ’Ale’a put them. I split them into 4 tranches to keep the number of files under 50 as required by gdown. Note that the folder protocol won’t take urls, but just the ID.
gdown --id 1ES3yMzZS8JkWROiE1jdfTp6KBjxDdCXO --folder --remaining-ok --output firstbatch
gdown --id 1Ofta4RLcf6vddIiAA6dLCkdrYngADW1d --folder --remaining-ok --output secondbatch
gdown --id 1Og6JQNlhgprYvLTxJtMBlCCj72qRxLOw --folder --remaining-ok --output thirdbatch
gdown --id 1Ojnlj3L_8UFoPUVxaqvLKTLs8qq_S9ml --folder --remaining-ok --output fourthbatch
Need to rename the files into Illumina format in order for Stacks to recognize the paired reads
cd raw
for f in *.F.fq.gz
do
mv $f ${f/_1.F.fq.gz/_R1_001.fastq.gz}
done
for f in *.R.fq.gz
do
mv $f ${f/_1.R.fq.gz/_R2_001.fastq.gz}
done
cd ..
Hmm… gotta rename the cleaned files too
# Strip the Illumina-style _R[12]_001 infix back off the cleaned read files,
# leaving the .1.fq.gz / .2.fq.gz pair suffixes Stacks expects here.
cd cleaned
for f in *001.1.fq.gz
do
mv $f ${f/_R[12]_001.1.fq.gz/.1.fq.gz}
done
for f in *001.2.fq.gz
do
mv $f ${f/_R[12]_001.2.fq.gz/.2.fq.gz}
done
I’m going to use the denovo_map.pl pipeline. This command will take
data from ./cleaned and the popmap I created for dDocent,
with -m 3 minimum reads to form a stack, -M 4 mismatches allowed between
stacks within an individual, and -n 4 mismatches allowed between catalog loci.
Running on 12 -T threads and only keeping loci
that appear in 75% of individuals in all 10 populations
denovo_map.pl --samples ./cleaned/ --popmap ../popmap --out-path ./pipeline --paired \
-m 3 -n 4 -M 4 -T 12 -r 75 -p 10 -X "populations: --fasta-samples" -X "populations: --filter-haplotype-wise"
Copy it down
scp -r ecrandall@128.118.123.64:eric_data/ptuberculosa/stacks/pipeline/output1 ./stacks_output
Now I need to re-run populations to get more statistics.
Original populations output is in output1, this will be in output2.
populations -P ../ -O ./ -M ../../../popmap -t 12 -p 10 -r 75 -H --hwe --fstats --p-value-cutoff 0.99 --fasta-loci --fasta-samples --vcf --genepop --structure --treemix --fasta-samples-raw
Paul Maier created this script to convert Stacks haplotypes to migrate format. I had to remove all the [individualName] tokens from the populations.samples.fa to get it to work.
python2 /Applications/migrate/migrate-4.4.4/fasta2genotype/fasta2genotype.py populations.samples2.fa NA ../popmap.tsv populations.snps.vcf ptuberculosa.mig
###################################################################
### ###
### Fasta2Genotype | Data Conversion | Version 1.10 ###
### ###
### Cite as follows: ###
### ###
### Maier P.A., Vandergast A.G., Ostoja S.M., Aguilar A., ###
### Bohonak A.J. (2019). Pleistocene glacial cycles drove ###
### lineage diversification and fusion in the Yosemite toad ###
### (Anaxyrus canorus). Evolution, in press. ###
### https://www.doi.org/10.1111/evo.13868 ###
### ###
###################################################################
Output type? [1] Migrate [2] Arlequin [3] DIYABC [4] LFMM [5] Phylip [6] G-Phocs [7] Treemix [8] Haplotype: 1
Remove restriction enzyme or adapter sequences? These may bias data. [1] Yes [2] No: 2
Coverage Cutoff (number reads for locus)? Use '0' to ignore coverage: 0
Remove monomorphic loci? [1] Yes [2] No: 2
Remove loci with excess heterozygosity? This can remove paralogs. [1] Yes [2] No: 1
Maximum heterozygosity cutoff for removing loci out of Hardy-Weinberg? 0.5
Filter for allele frequency? False alleles might bias data. [1] Yes [2] No: 2
Filter for missing genotypes? These might bias data. [1] Yes [2] No: 2
**************************************************************************************************************
*** ... BEGINNING CONVERSION ... ***
**************************************************************************************************************
Cataloging loci...
Counting locus lengths...
Cataloging populations...
Counting gene copies...
Counting alleles for each locus...
Identifying loci with excess heterozygosity...
Calculating observed heterozygosity and homozygosity...
Calculating expected heterozygosity and homozygosity...
Flagging loci with excess heterozygosity for removal...
Removing loci...
Removed 2 overly heterozygous loci.
Outputting migrate-n file...
*** DONE! ***
Then I manually re-ordered the populations to run from north to south in the output file, and moved it into the migrate folder.
Install the parallel version of Migrate on Nautilus server. I had to
remove -fvectorize from line 100 of the makefile to get
this to work.
curl https://peterbeerli.com/migrate-html5/download_version4/migrate-newest-src.tar.gz > migrate.tar.gz
tar -xvf migrate.tar.gz
cd migrate-4.4.4/
cd src
./configure
make mpis (for parallel running)
sudo cp migrate-n-mpi /usr/local/bin/migrate-n-mpi
These RAD loci are much less variable than COI that I’m used to
using. I need to figure out an overall mutation model to use with RAD
loci. I can’t find any discussion
of this issue on the Migrate Google Group, so I
created one. I guess I’ll use modelTest in the phangorn
package to see where that gets me.
I renamed the FASTA headers in populations.samples.fa
with BBEdit:
Find: >CLocus_\d+_Sample_\d+_Locus_(\d+)_Allele_([01]) \[(.+)\]
Replace: >\3_L\1_A\2
Lets load in the data and calculate some statistics for each locus.
Previously Migrate-n only implemented the F84 (=HKY) model, with two
rates (Transistions and Transversions) and gamma distributed rate
variability. The new v4 has a datamodel parameter that
suggests it might take other models of molecular evolution, but the
possible models are not listed in the documentation! So I will just fit
an HKY model.
note the interface lists these possible models: 1 Jukes-Cantor model 2 Kimura 2-parameter model 3 Felsenstein 1981 (F81) model 4 Felsenstein 1984 (F84) model 5 Hasegawa-Kishino-Yano model 6 Tamura-Nei model
# Load the renamed Stacks haplotype FASTA (headers now "<sample>_L<locus>_A<allele>").
fastadata <- read.FASTA("/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/stacks_output/allpops75_hwe/populations.samples_rename.fasta")
# have to read in the locus lengths, because this is the only way to make sure the stats
# and the migrate infile are ordered the same
# Line 2 of the migrate infile holds a tab-separated vector of locus lengths.
migrate_lengths <- read_lines("/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/stacks_output/ptuberculosa.mig", skip = 1, n_max=1) %>%
str_split("\\t", simplify = T) %>% t(.) %>% tibble(length=.)
# Get locus names. fasta2genotype orders them alphabetically, so do this here too
locus_names <- str_extract(names(fastadata),"L\\d+") %>% unique() %>% sort()
# Empty accumulator: one row per locus with summary statistics plus the
# parameters of the best-fit substitution model chosen by modelTest().
stats <- tibble(locus = character(),
length = numeric(),
segSites = numeric(),
nHaps = numeric(),
nucDiv = numeric(),
ttRatio = numeric(),
model = character(),
gammaShape = numeric(),
rate1 = numeric(),
rate2 = numeric(),
rate3 = numeric(),
rate4 = numeric(),
Q1= numeric(),
Q2 = numeric(),
Q3 = numeric(),
)
for(l in locus_names){
print(paste("Now Doing Locus", l, match(l, locus_names), "out of", length(locus_names)))
# all sequences (all samples, both alleles) whose name matches this locus
locus_dnabin <- fastadata[str_which(names(fastadata),pattern = l)]
# convert to package formats
locus_dnabin <- as.matrix(locus_dnabin)
# NOTE(review): sequence2gtypes() and TiTvRatio() look like strataG
# functions, but library(strataG) is commented out at the top of this
# file -- confirm strataG is attached before running this chunk.
locus_gtypes <- sequence2gtypes(locus_dnabin)
locus_phy <- phyDat(locus_dnabin)
#create a haplotype network .. to be continued
haps <- haplotype(locus_dnabin)
nhaps <- length(dimnames(haps)[[1]])
#tcs <- haploNet(haps)
#find parameters of HKY (F84) model
modeltest <- modelTest(locus_phy, model = c("JC","K80", "F81", "HKY","TrN"),
G = T, I = F, k = 4, mc.cores = 4)
# pick out the best model. If multiple models are tied for best, pick the simplest one
bestmodel <- modeltest$Model[which(modeltest$BIC == min(modeltest$BIC))][1]
#open the object environment
env <- attr(modeltest,"env")
bestparams <- get(bestmodel, env)
bestparams <- eval(bestparams, env=env)
# use this code for v3, which only has F84 (HKY)
#HKY <- modelTest(locus_phy, model = c("HKY"),
# G = T, I = F, k = 4)
#env <- attr(HKY, "env")
#HKYG <- get("HKY+G", env)
#model <- eval(HKYG, env=env)
# calculate TiTv Ratio
ttratio <- TiTvRatio(locus_gtypes)
# Append this locus's row. Q1..Q3 are the sorted unique entries of the
# fitted rate matrix; simpler models have fewer than three unique rates,
# so indexing past the end leaves NA (seen as lgl Q3 in the output below).
stats <- bind_rows(stats, tibble(locus=l,
length = length(locus_dnabin[1,]),
segSites = length(seg.sites(locus_dnabin)),
nHaps = length(dimnames(haps)[[1]]),
nucDiv = nuc.div(locus_dnabin),
ttRatio = ttratio[3],
model = bestmodel,
gammaShape = bestparams$shape,
rate1 = bestparams$g[1],
rate2 = bestparams$g[2],
rate3 = bestparams$g[3],
rate4 = bestparams$g[4],
Q1 = sort(unique(bestparams$Q))[1],
Q2 = sort(unique(bestparams$Q))[2],
Q3 = sort(unique(bestparams$Q))[3]
))
}
# Keep only loci whose length matches one in the migrate infile, so the
# stats rows line up with the loci migrate will actually see.
stats <- stats[which(stats$length %in% migrate_lengths$length),]
#setdiff(stats$length, as.numeric(migrate_lengths$length))
# write_csv(stats, "./migrate/run4_locus_statistics.csv")
Write a space-delimited text file that can be added to the migrate data file, in the format that Peter suggested.
# Re-load the per-locus statistics from the saved CSV so this chunk can run
# without recomputing modelTest() for every locus.
stats <- read_csv("./migrate/run4/run4_locus_statistics.csv")
## Rows: 109 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, model
## dbl (9): length, segSites, nHaps, nucDiv, ttRatio, gammaShape, rate1, Q1, Q2
## lgl (4): rate2, rate3, rate4, Q3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# write a space delimited textfile that can be added to the migrate data file
# following Peter's suggestion here:
#https://groups.google.com/g/migrate-support/c/XjV_4jZW4RI/m/HbRWoGY6AwAJ
# One "#$" line per locus: sequence datatype ("s"), the best-fit model name,
# and two of its distinct rate-matrix entries (Q2, Q3).
migrate_mutation_models <- tibble(prefix = "#$",
locus = seq_along(stats$locus), # seq_along() is safe if stats has 0 rows, unlike 1:length()
type = "s",
model = stats$model,
q1 = stats$Q2,
q2 = stats$Q3)
#write_delim(migrate_mutation_models,"./migrate/run4_modelblock.txt", na="", delim = " ")
stats
## # A tibble: 109 × 15
## locus length segSites nHaps nucDiv ttRatio model gammaShape rate1 rate2
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <lgl>
## 1 L10061 411 1 2 0.0000280 Inf F81 1 1 NA
## 2 L1096075 388 7 8 0.000234 6 K80 1 1 NA
## 3 L11008 472 2 6 0 1 JC 1 1 NA
## 4 L11074 468 2 3 0.0000491 0 JC 1 1 NA
## 5 L11315 499 4 5 0.000162 1 F81 1 1 NA
## 6 L11796 516 1 2 0.0000239 Inf F81 1 1 NA
## 7 L1180 496 5 6 0.000158 1.5 F81 1 1 NA
## 8 L12220 337 2 4 0.0000346 Inf F81 1 1 NA
## 9 L1257 512 3 9 0 0.5 JC 1 1 NA
## 10 L12728 618 1 2 0.0000180 0 JC 1 1 NA
## # ℹ 99 more rows
## # ℹ 5 more variables: rate3 <lgl>, rate4 <lgl>, Q1 <dbl>, Q2 <dbl>, Q3 <lgl>
So we have 16 invariant loci, and the mean overall transition:transversion ratio is 0.8073394. The mean gamma shape parameter is 1, which argues for only one rate.
I went through and compared my 3.x parmfile line by line to the default one generated by Migrate 4.4.4, and came up with this
################################################################################
# Parmfile for Migrate 4.4.4(git:v4-series-26-ge85c6ff)-June-1-2019 [do not remove these first TWO lines]
# generated automatically on
# Fri Feb 4 2022
menu=NO
nmlength=10
datatype=SequenceData
datamodel=HKY
ttratio=1.000000
freqs-from-data=YES
seqerror-rate={0.0001,0.0001,0.0001,0.0001}
categories=1 #no categories file specified
rates=1: 1.000000
prob-rates=1: 1.000000
autocorrelation=NO
weights=NO
recover=NO
fast-likelihood=NO
inheritance-scalars={1.00000000000000000000}
haplotyping=YES:no-report
population-relabel={1 2 3 4 5 6 7 8 9}
infile=../../../ptuberculosa.mig
random-seed=AUTO #OWN:410568459
title= Palythoa tuberculosa - Hawaii
progress=YES
logfile=YES:logfile.txt
print-data=NO
outfile=outfile.txt
pdf-outfile=outfile.pdf
pdf-terse=YES
use-M=YES
print-tree=NONE
mig-histogram=MIGRATIONEVENTSONLY
skyline=NO #needs mig-histogram=ALL:...
theta=PRIOR:50
migration=PRIOR:10
rate=PRIOR:50
split=PRIOR:10
splitstd=PRIOR:10
mutation=CONSTANT
analyze-loci=A
divergence-distrib=E
custom-migration={
* 0 0 0 0 d 0 0 0
0 * * 0 0 0 0 0 0
0 * * * 0 0 0 0 0
0 0 * * * 0 0 0 0
0 0 0 * * * 0 0 0
0 0 0 0 * * * 0 0
0 0 0 0 0 * * * 0
0 0 0 0 0 0 * * *
0 0 0 0 0 0 0 * *
}
geo=NO
updatefreq=0.200000 0.200000 0.200000 0.200000 #tree, parameter haplotype, timeparam updates
bayes-posteriorbins= 2000 2000
bayes-posteriormaxtype=TOTAL
bayes-file=YES:bayesfile
bayes-allfile=YES:bayesallfile
bayes-all-posteriors=YES:bayesallposterior
bayes-proposals= THETA METROPOLIS-HASTINGS Sampler
bayes-proposals= MIG SLICE Sampler
bayes-proposals= DIVERGENCE METROPOLIS-HASTINGS Sampler
bayes-proposals= DIVERGENCESTD METROPOLIS-HASTINGS Sampler
bayes-priors= THETA WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-priors= MIG WEXPPRIOR: 0.000100 1000.000000 10000 100
bayes-priors= SPLIT * * WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-priors= SPLITSTD * * WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-hyperpriors=NO
long-chains=1
long-inc=100
long-sample=10000
burn-in=2000
auto-tune=YES:0.440000
assign=NO
heating=YES:1:{1.000000,1.500000,3.000000,1000000.000000}
heated-swap=YES
moving-steps=NO
gelman-convergence=No
replicate=NO
end
Let’s see what happens! Started this run on Jan 22, at 10pm.
screen -S migrate_testrun
mpirun -np 32 ~/migrate-4.4.4/src/migrate-n-mpi parmfile
And it was finished the next morning! Prior on m was too wide, giving results like this:
No
bueno.
Revised the m prior like so (I had no window value in the original!) Order of values is min, mean, maximum, proposal window.
bayes-priors= MIG WEXPPRIOR: 0.000100 1000.000000 10000 100
A few modifications to make a panmictic model
population-relabel={1 1 1 1 1 1 1 1 1 1}
#and
custom-migration={
*
}
Change all parameter values to m for island model.
custom-migration={
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
}
Minor github issue. Got a little hung up at this point because I
accidentally committed some bayesallfiles that were hundreds of Mb, and
of course github wouldn’t let me upload those. Thought I fixed it with
these
instructions. Then ended up trying git-filter-repo
(installed via homebrew), then ended up restoring the whole thing from
the github repository. Ouch. Then it wouldn’t push until I ran
git prune
Very nice! 😃 Going to need to modify my code to read multilocus output from migrate-n.
# Harvest marginal likelihoods for the three run2 models (one subdirectory
# per model) and display them as a table.
workingDir <- "./migrate/run2"
modelMarglikes <- harvest.model.likelihoods(workingDir = workingDir)
## [1] "./migrate/run2/island"
## [1] "./migrate/run2/panmixia"
## [1] "./migrate/run2/stepping.stone"
kable(modelMarglikes, format.args = list(digits = 5))
| model | thermodynamic | bezier.corrected | harmonic |
|---|---|---|---|
| island | -71972 | -71120 | -71813 |
| panmixia | -70400 | -69492 | -69655 |
| stepping.stone | -65995 | -65136 | -65790 |
# Compute log Bayes factors, model ranks, and model probabilities from the
# harvested marginal likelihoods (bfcalcs() is defined at the top of the file).
results <- bfcalcs(modelMarglikes)
kable(results)
| model | thermodynamic | bezier.corrected | harmonic | lbf | choice | modelprob |
|---|---|---|---|---|---|---|
| island | -71971.81 | -71120.19 | -71812.53 | -11967.82 | 3 | 0 |
| panmixia | -70400.30 | -69492.18 | -69654.60 | -8711.80 | 2 | 0 |
| stepping.stone | -65995.43 | -65136.28 | -65789.54 | 0.00 | 1 | 1 |
# Load the posterior sample (bayesallfile) for the winning run2 model and
# compute coda convergence diagnostics and HPD intervals per locus.
workingDir2 <- "/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/stacks_output/migrate/testrun/run2"
winningModelDir <- file.path(workingDir2, "stepping.stone")
#this may take a minute or two
data <- read.table(file.path(winningModelDir,"bayesallfile"), header=T)
#calculate migrants per generation
data.nm <- migrants.per.gen(data)
# list the loci present in the sample (the bayesallfile column is "Locus",
# capitalized -- lowercase "locus" was an error here)
data %>% group_by(Locus) %>% summarize()
# Split the whole list into the individual replicates, so that you will
# have a list of data frames
#data.list.1 <- split(data.nm,data$Replicate)
#split the data on locus
data.list.1 <- split(data.nm,data$Locus)
# Remove burnin if it wasn't removed in the run, where burnin is a proportion of the total.
# e.g. First 40% First 20,000 samples out of 50,000 or 10 million steps out of 25 million
#burnin <- 0.2
#data.list.1<-lapply(data.list.1,subset,subset=Steps>(burnin*max(data.list.1[[1]]$Steps)))
#convert each dataframe to an mcmc object and then convert the whole thing to an mcmc list
# (the original built this correctly and then overwrote it with
# mcmc.list(data.list.1), wrapping raw data frames instead of mcmc objects)
data.list.mcmc <- mcmc.list(lapply(data.list.1,mcmc))
#collapse all replicates to one for purposes of highest posterior density
# (was do.call("rbind",data.list.2): data.list.2 is never defined in this file)
data.list.allinone <- mcmc(data=do.call("rbind",data.list.1))
summ <- summary(data.list.mcmc)
ess <- effectiveSize(data.list.mcmc)
gelman <- gelman.diag(data.list.mcmc,multivariate=F)
HPD <- HPDinterval(data.list.allinone)
#concatenate the stats
allstats<-cbind(summ$statistics[,1:2],HPD,ess,gelman$psrf)
#write.csv(allstats,file="codastats.csv")
’Ale’a and I settled on the following models
#Panmixia (panmixia; 1 panmictic population)
population-relabel={1 1 1 1 1 1 1 1 1 1}
custom-migration= *
#n-Island (island; all populations same size, all exchange migrants)
population-relabel={1 2 3 4 5 6 7 8 9}
custom-migration=
Pop_Kure m m m m m m m
Pop_P&H m m m m m m m
Pop_MaroReef m m m m m m m
Pop_FFS m m m m m m m
Pop_Kauai m m m m m m m
Pop_Oahu m m m m m m m
Pop_Molokai m m m m m m m
Pop_Maui m m m m m m m
Pop_BigIsland m m m m m m m
#Stepping-Stone (stepping.stone)
population-relabel={1 2 3 4 5 6 7 8 9}
Pop_Kure * * 0 0 0 0 0 0 0
Pop_P&H * * 0 0 0 0 0 0 0
Pop_MaroReef 0 * * * 0 0 0 0 0
Pop_FFS 0 0 * * * 0 0 0 0
Pop_Kauai 0 0 0 * * * 0 0 0
Pop_Oahu 0 0 0 0 * * * 0 0
Pop_Molokai 0 0 0 0 0 * * * 0
Pop_Maui 0 0 0 0 0 0 * * *
Pop_BigIsland 0 0 0 0 0 0 0 * *
#Stepping-Stone with a Kure-Oahu connection (stepping.stone.KO)
population-relabel={1 2 3 4 5 1 7 8 9}
custom-migration=
Pop_Kure * * 0 0 0 0 * 0 0
Pop_P&H * * 0 0 0 0 0 0 0
Pop_MaroReef 0 * * * 0 0 0 0 0
Pop_FFS 0 0 * * * 0 0 0 0
Pop_Kauai 0 0 0 * * * 0 0 0
Pop_Oahu * 0 0 0 * * * 0 0
Pop_Molokai 0 0 0 0 0 * * * 0
Pop_Maui 0 0 0 0 0 0 * * *
Pop_BigIsland 0 0 0 0 0 0 0 * *
#NWHI vs MHI (NWHI_MHI; two panmictic regions)
population-relabel={1 1 1 1 2 2 2 2 2}
custom-migration=
Pop_NWHI * *
Pop_MHI * *
# Stepping Stone Breaks (stepping.stones.breaks;
# regional groups of Kure_PH vs MR_FFS vs MHI as lumped populations
# with stepping stone gene flow between them. )
population-relabel={1 1 2 2 3 4 5 6 7}
custom-migration=
Pop_Kure_PH * * 0 0 0 0 0
Pop_MaroReef_FFS * * * 0 0 0 0
Pop_Kauai 0 * * * 0 0 0
Pop_Oahu 0 0 * * * 0 0
Pop_Molokai 0 0 0 * * * 0
Pop_Maui 0 0 0 0 * * *
Pop_BigIsland 0 0 0 0 0 * *
# Stepping Stone Breaks (stepping.stones.breaks;
# regional groups of Kure_PH vs MR_FFS vs MHI as lumped populations
# with stepping stone gene flow between them, and a Kure-Oahu connection.)
population-relabel={1 1 2 2 3 4 5 6 7}
custom-migration=
Pop_Kure_PH * * 0 * 0 0 0
Pop_MaroReef_FFS * * * 0 0 0 0
Pop_Kauai 0 * * * 0 0 0
Pop_Oahu * 0 * * * 0 0
Pop_Molokai 0 0 0 * * * 0
Pop_Maui 0 0 0 0 * * *
Pop_BigIsland 0 0 0 0 0 * *
I also set menu=NO and replicate=YES:3.
So we will do 10 replicates of 3 replicates. This will start at r1, and run all models for that before moving on. Pretty sure this will finish one whole model before moving on to the next one (since all threads are being used for different loci)
#!/bin/bash
# Run migrate-n for every model directory inside every replicate directory.
# Expected layout: <rep>/<model>/parmfile. Models run sequentially because
# each mpirun already uses all 32 slots (parallelizing over loci).
for r in */
do
cd "$r" || exit 1
echo "$r"
date
date > date.txt   # record replicate start time
for m in */
do
cd "$m" || exit 1
date > date.txt   # record model start time
echo "$m"
date
mpirun -np 32 ~/migrate-4.4.4/src/migrate-n-mpi parmfile
sleep 1
cd ..
done
cd ..
done
Copy it up, and then make a total of 10 replicate copies of the folder.
scp -r ./run4 ecrandall@nautilus.psu.edu:./Ptuberculosa/run4/r1
#once on server, copy it 9 times
for a in $(seq 2 10); do cp -r rep1 rep$a; done
And ten replicates finished in about 10 days! Found this guide to download everything except the massive bayesallfiles and pdfs
rsync -av -e ssh --exclude='bayes*' --exclude="*.pdf" ecrandall@nautilus.psu.edu:~/Ptuberculosa/run4 ./
rsync -av -e ssh --exclude='bayes*' ecrandall@nautilus.psu.edu:~/Ptuberculosa/run4/rep1/stepping.stone.all ./stepping.stone.KO.all
runDir <- "/Volumes/GoogleDrive/My Drive/ptuberculosa/migrate/run4"
#runDir <-"/Volumes/GoogleDrive-103325533945714033178/My Drive/ptuberculosa/migrate/run4"
# Harvest model marginal likelihoods from each of the 10 replicate folders
# into a named list, one data frame per replicate.
likelist <- list()
for (i in seq_len(10)) {
  rep_name <- paste0("rep", i)
  print(rep_name)
  likelist[[rep_name]] <- harvest.model.likelihoods(workingDir = file.path(runDir, rep_name))
}
# Model selection for each replicate...
map(likelist, bfcalcs)
The final model marginal likelihood estimates based on the mean of 10 replicates:
# Average the bezier-corrected marginal likelihoods across replicates for each
# model, then run model selection on the means.
like.df <- group_by(bind_rows(likelist), model)
means <- summarize(like.df, bezier.corrected = mean(bezier.corrected))
final_model <- bfcalcs(means)
final_model
## [1] model bezier.corrected lbf choice
## [5] modelprob
## <0 rows> (or 0-length row.names)
Difference between stepping.stone and stepping.stone.KO is very significant. 🙋
# Exact permutation test (complete enumeration) of whether the best model's
# per-replicate marginal likelihoods exceed the runner-up's.
top.choice <- final_model$model[which(final_model$choice ==1)]
second.choice <- final_model$model[which(final_model$choice ==2)]
permTS(like.df$bezier.corrected[which(like.df$model == top.choice)],
like.df$bezier.corrected[which(like.df$model == second.choice)],
alternative = "greater", method = "exact.ce")
There is an intriguing result of nearly one-way gene flow from Oahu to Kure. Also, the stepping.stone.breaks.KO is much better than just stepping.stone.breaks. This suggests that a divergence parameter for these two populations is in order.
Stepping.stone.KO.oneway
#Stepping-Stone with a *one way* Kure-Oahu ongoing gene flow connection
# (stepping.stone.KO.oneway)
population-relabel={1 2 3 4 5 1 7 8 9}
custom-migration={
* * 0 0 0 * 0 0 0
* * * 0 0 0 0 0 0
0 * * * 0 0 0 0 0
0 0 * * * 0 0 0 0
0 0 0 * * * 0 0 0
* 0 0 0 * * * 0 0
0 0 0 0 0 * * * 0
0 0 0 0 0 0 * * *
0 0 0 0 0 0 0 * *
}
Going to add more replicates of stepping.stone.KO and stepping.stone.KO.oneway
for a in $(seq 12 30); do cp -r rep11 rep$a; done
rsync -av -e ssh --exclude='bayes*' --exclude="*.pdf" ecrandall@nautilus.psu.edu:~/Ptuberculosa/run4.2 ./
# Harvest marginal likelihoods from all 30 replicates (reps 11-30 were added
# for the stepping.stone.KO / .KO.oneway comparison).
runDir <- "/Volumes/GoogleDrive/My Drive/ptuberculosa/migrate/run4"
#runDir <- "/Volumes/GoogleDrive-103325533945714033178/My Drive/ptuberculosa/migrate/run4"
likelist <- list()
for(r in 1:30){
rep = paste0("rep",r)
print(rep)
likelist[[rep]] <- harvest.model.likelihoods(workingDir=file.path(runDir,rep))
}
# Model selection for each replicate...
#likelist %>% map(bfcalcs)
The final model marginal likelihood estimates based on the mean of 30 replicates:
# Mean bezier-corrected marginal likelihood per model across the 30
# replicates, then model selection on the means.
like.df <- likelist %>% bind_rows() %>% group_by(model)
means <- like.df %>% summarize(bezier.corrected = mean(bezier.corrected))
final_model <- bfcalcs(means)
final_model
## [1] model bezier.corrected lbf choice
## [5] modelprob
## <0 rows> (or 0-length row.names)
This time stepping stone with two-way Kure-Oahu connection beats out the one way connection
Peter says you can mix * with d parameters, and I tested it and it worked! Based on these results I’m going to add a model of one way ongoing migration from Oahu to Kure, and another model of divergence from Oahu to Kure about 80 years ago with no subsequent gene flow, and finally, divergence plus migration between Oahu to Kure.
Equilibrium Gene Flow, but divergence with gene flow from Oahu to Kure
#Stepping-Stone with a *one way* Kure-Oahu divergence with gene flow
# (stepping.stone.KO.divmig)
population-relabel={1 2 3 4 5 1 7 8 9}
custom-migration=
Pop_Kure * 0 0 0 0 0 D 0 0
Pop_P&H 0 * 0 0 0 0 0 0 0
Pop_MaroReef 0 * * * 0 0 0 0 0
Pop_FFS 0 0 * * * 0 0 0 0
Pop_Kauai 0 0 0 * * * 0 0 0
Pop_Oahu 0 0 0 0 * * * 0 0
Pop_Molokai 0 0 0 0 0 * * * 0
Pop_Maui 0 0 0 0 0 0 * * *
Pop_BigIsland 0 0 0 0 0 0 0 * *
Equilibrium Gene Flow, but divergence without gene flow from Oahu to Kure
#Stepping-Stone with a *one way* Kure-Oahu divergence without gene flow
# (stepping.stone.KO.div)
population-relabel={1 2 3 4 5 1 7 8 9}
custom-migration=
Pop_Kure * 0 0 0 0 0 d 0 0
Pop_P&H 0 * 0 0 0 0 0 0 0
Pop_MaroReef 0 * * * 0 0 0 0 0
Pop_FFS 0 0 * * * 0 0 0 0
Pop_Kauai 0 0 0 * * * 0 0 0
Pop_Oahu 0 0 0 0 * * * 0 0
Pop_Molokai 0 0 0 0 0 * * * 0
Pop_Maui 0 0 0 0 0 0 * * *
Pop_BigIsland 0 0 0 0 0 0 0 * *
North to South Divergence with migration
* 0 0 0 0 0 0 0 0
D * 0 0 0 0 0 0 0
0 D * 0 0 0 0 0 0
0 0 D * 0 0 0 0 0
0 0 0 D * 0 0 0 0
0 0 0 0 D * 0 0 0
0 0 0 0 0 D * 0 0
0 0 0 0 0 0 D * 0
0 0 0 0 0 0 0 D *
North to South Divergence without migration
* 0 0 0 0 0 0 0 0
d * 0 0 0 0 0 0 0
0 d * 0 0 0 0 0 0
0 0 d * 0 0 0 0 0
0 0 0 d * 0 0 0 0
0 0 0 0 d * 0 0 0
0 0 0 0 0 d * 0 0
0 0 0 0 0 0 d * 0
0 0 0 0 0 0 0 d *
Here, population 1 isn’t born until it splits from population 6. I’m having trouble setting an individual prior for a very young divergence from 6 to 1- apparently I found a bug- Peter is fixing it.
* 0 0 0 0 d 0 0 0
0 * 0 0 0 0 0 0 0
0 D * 0 0 0 0 0 0
0 0 D * 0 0 0 0 0
0 0 0 D * 0 0 0 0
0 0 0 0 D * 0 0 0
0 0 0 0 0 D * 0 0
0 0 0 0 0 0 D * 0
0 0 0 0 0 0 0 D *
And we need to add reasonable priors on divergence time to match the foundation of Naval Air Station Midway in 1941, with a standard deviation of say 20 years because it was placed under Navy control in 1903. Splitting times \(\tau\) in Migrate are in units of \(\mu\) x generations. I will assume a nuclear mutation rate of 1e-9 and a generation time of 1 year. Peter confirmed
#take the mean of the modes from one replicate of the stepping.stone.KO model
#meantheta <- mean(0.00570+0.00450+0.00450+0.00490+0.00630+0.00510+0.00570+0.00510+0.00590)
# Convert calendar years into migrate's splitting-time units
# (tau = years * mu / generation time), assuming a nuclear mutation rate of
# 1e-9 per site per year and a one-year generation time.
mu <- 1e-9
generation_time <- 1
years_to_tau <- function(years) years * mu / generation_time
mean_split <- years_to_tau(80)  # Naval Air Station Midway founded 1941
std_split <- years_to_tau(20)   # under Navy control since 1903
max_split <- years_to_tau(163)  # Midway discovered 1859
80 years ago corresponds to 8e-8, and a maximum of 163 years (Midway was discovered in 1859) corresponds to 1.63e-7.
It turns out that setting individual priors on divergence isn’t working right now, so I’m just going to have to set a wide prior if there are other d or D parameters and only can set the specific prior if the Kure Oahu split is the only split in the model.
This results in adding the following two lines to the parmfiles with only the Kure Oahu split.
bayes-priors= SPLIT * * WEXPPRIOR: 0.0 8e-8 1.63e-7 1.63e-8
bayes-priors= SPLITSTD * * WEXPPRIOR: 0.0 8e-8 1.63e-7 1.63e-8
Unfortunately the prior above was resulting in error messages.
[1] 63427 illegal hardware instruction /Applications/migrate/migrate-4.4.4/migrate-n
From the output, it looks like it can’t consider priors smaller than 1e-6. But even using 1e-5 or 1e-4 results in errors.
So all will use this wider prior. Also, I am using exponentially distributed priors, so the SPLITSTD is not going to be used.
bayes-priors= SPLIT * * WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-priors= SPLITSTD * * WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
Copy it up, and then make a total of 10 replicate copies of the folder.
scp -r ./run4.1 ecrandall@nautilus.psu.edu:./Ptuberculosa/run4.1
mkdir rep1
mv step* rep1
cp ../run4/ptuberculosa.mig ./ptuberculosa.mig
cp ../run4/mpi_run_migrate-n.sh ./mpi_run_migrate-n.sh
cd rep1
#once on server, copy it 9 times
for a in $(seq 2 10); do cp -r rep1 rep$a; done
rsync -av -e ssh --exclude='bayes*' ecrandall@nautilus.psu.edu:~/Ptuberculosa/run4.5 ./
rsync -av -e ssh --exclude='bayes*' --exclude="*.pdf" eric@moneta.tobolab.org:~/Palythoa/run4.6 ./run4.6
# then copied them into the run4 folder with:
# the exclude statement excludes stepping.stone.KOd and stepping.stone.KO.D which didn't complete.
rsync -av --progress run4.5/ run4/ --exclude '*KO[\.d]*'
rsync -av --progress run4.6/ run4/
# Final harvest over all 30 replicates (now from the OneDrive copy), after
# merging run4.5/run4.6 results into the run4 folder.
#runDir <- "/Volumes/GoogleDrive/My Drive/ptuberculosa/migrate/run4"
runDir <-"/Users/eric/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Research/Ptuberculosa/migrate/run4"
likelist <- list()
for(r in 1:30){
rep = paste0("rep",r)
print(rep)
likelist[[rep]] <- harvest.model.likelihoods(workingDir=file.path(runDir,rep))
}
# Model selection for each replicate...
likelist %>% map(bfcalcs)
The final model marginal likelihood estimates based on the mean of 10 replicates:
# Mean bezier-corrected marginal likelihood per model across replicates,
# then model selection on the means (13 models in this comparison).
like.df <- likelist %>% bind_rows() %>% group_by(model)
means <- like.df %>% summarize(bezier.corrected = mean(bezier.corrected))
final_model <- bfcalcs(means)
final_model
## model bezier.corrected lbf choice modelprob
## 1 NWHI_MHI -72281.30 -35630.64 9 0
## 2 island -75311.81 -41691.66 10 0
## 3 panmixia -72172.88 -35413.80 8 0
## 4 stepping.stone -71256.23 -33580.50 7 0
## 5 stepping.stone.KO -69261.69 -29591.42 3 0
## 6 stepping.stone.KO.D -55474.16 -2016.36 2 0
## 7 stepping.stone.KO.oneway -69296.21 -29660.46 4 0
## 8 stepping.stone.KOd -54465.98 0.00 1 1
## 9 stepping.stone.breaks -71110.23 -33288.51 6 0
## 10 stepping.stone.breaks.KO -70307.17 -31682.39 5 0
## 11 stepping.stone.splits.d -77896.08 -46860.21 13 0
## 12 stepping.stone.splitsD -77091.94 -45251.93 12 0
## 13 stepping.stone.splitsD.KOsplitd -76884.10 -44836.24 11 0
The two best models are equilibrium models across the archipelago, with recent divergence from Oahu to Kure. The best one is divergence without gene flow, followed by divergence with gene flow. The difference between the top two models is significant. 😀
# Permutation test of the best model (stepping.stone.KOd) against the
# runner-up (stepping.stone.KO.D); default Monte Carlo method this time.
top.choice <- final_model$model[which(final_model$choice ==1)]
second.choice <- final_model$model[which(final_model$choice ==2)]
TS <- permTS(like.df$bezier.corrected[which(like.df$model == top.choice)],
like.df$bezier.corrected[which(like.df$model == second.choice)],
alternative = "greater")
TS
And some figures summarizing all this. The two best models are equilibrium models across the archipelago, with recent divergence from Oahu to Kure. The best one is divergence without gene flow, followed by divergence with gene flow. The three models with divergences are much worse than those without. Difference between stepping.stone and stepping.stone.KO is very significant.
# Human-readable model labels for the x axis.
# NOTE(review): scale_x_discrete(labels = models) pairs these positionally
# with the alphabetically-sorted model factor levels -- confirm the ordering
# matches the 13 directory names before publishing this figure.
models <- c("N-Island", "NWHI vs. MHI"," Panmixia", "SS: Islands",
"SS: Regional Breaks", "SS: Regional Breaks, KO 2way",
"SS: Islands, OK 2way",
"SS: Islands, O2K Colonization with GF",
"SS: Islands, O2K 1 way",
"SS: Islands, O2K Colonization without GF",
"Seq Div: No GF","Seq Div: GF",
"Seq Div: GF, O2K Colonization without GF")
# Violin plot of per-replicate marginal likelihoods for each model,
# with the median drawn as a quantile line.
likesPlot <- likelist %>% bind_rows() %>% group_by(model) %>%
ggplot(mapping = aes(x=model, y = bezier.corrected)) +
geom_violin(draw_quantiles = 0.5) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_x_discrete(labels = models) +
labs(x = "Metapopulation Model",
y = "Bezier Corrected Marginal Likelihood")
likesPlot
Zoom in on the top two models
# Zoom the y axis onto the two best models; ylim() drops points outside the
# range, hence the warnings below.
likesPlot_zoom <- likesPlot + ylim(-69400, -54000) + geom_point()
likesPlot_zoom
## Warning: Removed 92 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 92 rows containing missing values (`geom_point()`).
Sigh. I figured out that I had a ridiculously high Ti:Tv rate (\(\kappa\)) for 3 loci that had K80 models. This might have affected the summed marginal likelihood of each model just a bit, and definitely could affect parameter estimation, so I am going to rerun the three top models (stepping.stone.KOd, stepping.stone.KO.D, stepping.stone.KO) using a new version of migrate that doesn’t have bayesfile visualization issues, and with the ti:tv ratio fixed on those three loci.
rsync -av -e ssh --exclude='bayesfile*' --exclude="*.pdf" eric@threadripper.tobolab.org:~/Palythoa/run4.71 ./run4.7_full
# Harvest the run4.7 re-runs (top three models, corrected Ti:Tv ratios),
# 10 replicates.
runDir <-"/Users/eric/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Research/Ptuberculosa/migrate/run4.7"
likelist <- list()
for(r in 1:10){
rep = paste0("rep",r)
print(rep)
likelist[[rep]] <- harvest.model.likelihoods(workingDir=file.path(runDir,rep))
}
# Model selection for each replicate...
likelist %>% map(bfcalcs)
The final model marginal likelihood estimates based on the mean of 10 replicates:
# Mean bezier-corrected marginal likelihood per model over the run4.7
# replicates, then model selection on the means.
like.df <- likelist %>% bind_rows() %>% group_by(model)
means <- like.df %>% summarize(bezier.corrected = mean(bezier.corrected))
final_model <- bfcalcs(means)
final_model
## model bezier.corrected lbf choice modelprob
## 1 stepping.stone.KO -65211.06 -34149.632 3 0.000000e+00
## 2 stepping.stone.KO.D -48214.14 -155.794 2 1.478301e-34
## 3 stepping.stone.KOd -48136.25 0.000 1 1.000000e+00
The two best models are equilibrium models across the archipelago, with recent divergence from Oahu to Kure. The best one is divergence without gene flow, followed by divergence with gene flow. The difference between the top two models is significant. 😀
# Permutation test of stepping.stone.KOd vs stepping.stone.KO.D on the
# run4.7 per-replicate likelihoods.
top.choice <- final_model$model[which(final_model$choice ==1)]
second.choice <- final_model$model[which(final_model$choice ==2)]
TS <- permTS(like.df$bezier.corrected[which(like.df$model == top.choice)],
like.df$bezier.corrected[which(like.df$model == second.choice)],
alternative = "greater")
TS
And some figures summarizing all this. The two best models are equilibrium models across the archipelago, with recent divergence from Oahu to Kure. The best one is divergence without gene flow, followed by divergence with gene flow. The three models with divergences are much worse than those without. Difference between stepping.stone and stepping.stone.KO is very significant.
# Labels for the three run4.7 models; as above, these pair positionally with
# the alphabetically-sorted factor levels (KO, KO.D, KOd).
models <- c("SS: Islands, OK 2way",
"SS: Islands, O2K Colonization with GF",
"SS: Islands, O2K Colonization without GF")
likesPlot <- likelist %>% bind_rows() %>% group_by(model) %>%
ggplot(mapping = aes(x=model, y = bezier.corrected)) +
geom_violin(draw_quantiles = 0.5) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_x_discrete(labels = models) +
labs(x = "Metapopulation Model",
y = "Bezier Corrected Marginal Likelihood") +
ylim(-50000,-45000)
likesPlot
## Warning: Removed 5 rows containing non-finite values (`stat_ydensity()`).
# NOTE(review): the table above ranks stepping.stone.KOd first, yet
# winningModel is set to stepping.stone.KO.D, which the posterior analyses
# below will load -- confirm this choice is intentional.
winningModel <- "stepping.stone.KO.D"
Because I set such wide priors on gene flow for marine species, the standard PDF output does a poor job of binning the data, such that even though an equilibrium model of gene flow is the best model, Migrate is telling me that all migration parameters have a mode of 2.5 and 0 at the 25th percentile. It would be nice if I could look at the posterior calculated across all loci myself, but it is unclear how Migrate does this exactly. I inquired about this to Peter, and he got back to me with this method for summing the posterior over all loci and some Python code as a demo.
- create a histogram for each locus
- log the histogram (there will be issues with zero value cells) *. migrate smoothes each locus histogram
- subtract the log(prior)
- sum all log histograms for each locus
- add the log prior back
- convert to non-log
- adjust so that the new histogram sums to 1.0
#
# Load the raw posterior sample (bayesallfile) for the winning model from
# the first two replicates; each file is large, so this is slow.
Dir <- "/Users/eric/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Research/Ptuberculosa/migrate/run4.7"
winningModel <- "stepping.stone.KO.D"
posteriors <- list()
#read in the posterior for one replicate this may take 5 minutes
for(r in 1:2){
print(paste("Loading Rep",r))
posteriors[[r]] <- read.table(file.path(Dir,paste0("rep",r),winningModel,
"bayesallfile"), header=T)
}
## [1] "Loading Rep 1"
## [1] "Loading Rep 2"
#combine into one, and then keep only the first 2 reps (based on diagnostics below)
# bind_rows(.id = "rep") stores the replicate id as *character*; coerce to
# integer before comparing so the filter also works with 10+ replicates
# (character "10" <= 2 would compare lexically).
posteriors12 <- posteriors %>% bind_rows(.id = "rep") %>%
filter(as.integer(rep) <= 2) %>% migrants.per.gen()
# long format used by the per-locus posterior density plots below
# (was commented out, leaving posteriors12_long undefined for those plots)
posteriors12_long <- posteriors12 %>% pivot_longer(starts_with(c("Theta_", "M_",
"Nm_","D_","d_")),
names_to = "parameter",
values_to = "value")
# read in a population key to translate numbers into names in figures
popkey <- read_csv("popkey.csv")
## Rows: 9 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Pop
## dbl (1): Index
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Name the population vector by index so str_replace_all() can use it as a
# named pattern->replacement lookup (e.g. "1" -> "Kure").
names(popkey$Pop) <- popkey$Index
#make a new labeling key
posterior_parameters <- names(posteriors12) %>% str_subset("Theta|M_|Nm_|D_|d_")
# Translate raw column names (e.g. "M_6_1") into readable labels
# (e.g. "m/mu Oahu to Kure"); order of replacements matters, since the
# population lookup rewrites the digits produced by the first step.
parameter_key <- posterior_parameters %>%
str_replace("(\\d)_(\\d)", "\\1 to \\2") %>%
str_replace("M_", "m/mu " ) %>%
str_replace("Theta_", "theta ") %>%
str_replace_all(popkey$Pop) %>%
str_replace("Nm_", "4Nem ")
names(parameter_key) <- posterior_parameters
# posteriors.grouped <- posteriors %>%
# select(all_of(c("Locus",
# "Steps",
# posterior_parameters))) %>%
# group_by(rep)
# posterior.reps <- group_split(posterior.grouped)
# posterior.key <- group_keys(posterior.grouped)
# thin by keeping only every 500th step (Steps %% 500 == 0), not every other step
posteriors2 <- lapply(posteriors,filter,Steps%%500==0)
# select parameters only
posteriors2 <- lapply(posteriors2, select, starts_with(c("Theta_", "M_",
"Nm_","D_","d_")))
# one mcmc chain per replicate, recorded with the matching thinning interval
posteriors.mcmc <- mcmc.list(lapply(posteriors2,mcmc,thin=500))
posterior.ggs <- ggs(posteriors.mcmc, burnin = F)
effectiveSize(posteriors2[[1]])
#ggmcmc(posterior.ggs, param_page = 4)
Effective sizes are good except for M_3_4.
effectiveSize(posteriors2[[1]]) (first replicate)
Theta_1 Theta_2 Theta_3 Theta_4 Theta_5 Theta_6 Theta_7 Theta_8
127773.10928 60095.43078 58552.43846 71943.00597 77691.51114 61662.55359 45230.02725 64583.65543
Theta_9 M_6_1 M_3_2 M_2_3 M_4_3 M_3_4 M_5_4 M_4_5
79323.26808 147192.07123 67813.55644 448.34109 1473.22013 68.48918 3983.63608 39822.63084
M_6_5 M_5_6 M_7_6 M_6_7 M_8_7 M_7_8 M_9_8 M_8_9
140352.48682 316433.45752 333193.44720 63909.27617 63612.89938 653891.00000 38723.33119 441357.70803
D_6_1
202634.81598
effectiveSize(posteriors.mcmc[c(1,2)])
Theta_1 Theta_2 Theta_3 Theta_4 Theta_5 Theta_6 Theta_7 Theta_8 Theta_9 M_6_1
268101.80 175117.21 86454.19 84954.50 91454.47 134416.13 127731.50 110109.44 174394.67 313435.43
M_3_2 M_2_3 M_4_3 M_3_4 M_5_4 M_4_5 M_6_5 M_5_6 M_7_6 M_6_7
674097.79 236556.17 421717.30 218933.72 390887.59 272926.62 585871.37 589532.54 613794.87 495933.81
M_8_7 M_7_8 M_9_8 M_8_9 D_6_1
421019.97 667326.66 39515.35 531464.72 415678.34
Gelman-Rubin diagnostic is a bit high for M_ parameters, but if I log transform them, all Gelman diagnostics are below 1.2.
gelman.diag(posteriors.mcmc)
Potential scale reduction factors:
Point est. Upper C.I.
Theta_1 1.00 1.00
Theta_2 1.01 1.02
Theta_3 1.00 1.01
Theta_4 1.00 1.01
Theta_5 1.00 1.00
Theta_6 1.00 1.01
Theta_7 1.01 1.02
Theta_8 1.00 1.00
Theta_9 1.01 1.01
M_6_1 1.00 1.00
M_3_2 1.20 1.25
M_2_3 1.29 1.34
M_4_3 1.28 1.34
M_3_4 1.29 1.82
M_5_4 1.12 1.15
M_4_5 1.06 1.06
M_6_5 1.04 1.04
M_5_6 1.05 1.05
M_7_6 1.01 1.01
M_6_7 1.20 1.20
M_8_7 1.01 1.01
M_7_8 1.17 1.17
M_9_8 1.24 1.40
M_8_9 1.28 1.30
D_6_1 1.00 1.00
Multivariate psrf
1.03
gelman.diag(posteriors.mcmc, transform = T)
Potential scale reduction factors:
Point est. Upper C.I.
Theta_1 1.00 1.00
Theta_2 1.00 1.01
Theta_3 1.00 1.00
Theta_4 1.00 1.01
Theta_5 1.00 1.00
Theta_6 1.00 1.01
Theta_7 1.00 1.01
Theta_8 1.00 1.00
Theta_9 1.00 1.01
M_6_1 1.00 1.00
M_3_2 1.01 1.03
M_2_3 1.01 1.03
M_4_3 1.01 1.02
M_3_4 1.02 1.04
M_5_4 1.01 1.02
M_4_5 1.00 1.01
M_6_5 1.00 1.01
M_5_6 1.01 1.02
M_7_6 1.00 1.01
M_6_7 1.00 1.01
M_8_7 1.00 1.00
M_7_8 1.01 1.01
M_9_8 1.01 1.01
M_8_9 1.00 1.01
D_6_1 1.00 1.00
Multivariate psrf
1.02
Here are plots of each parameter posterior, with each colored line being the posterior from one of 106 loci. Next thing I have to do is code up the summation described by Peter above.
\(\theta=4N_e\mu\)
# Posterior densities of theta (= 4*Ne*mu), one colored line per locus,
# faceted by population.
# NOTE(review): posteriors12_long is only created by a pivot_longer() that is
# commented out earlier in this document -- uncomment it before running.
Theta_plot <- posteriors12_long %>%
filter(str_detect(parameter, "Theta_")) %>%
ggplot() +
geom_density(mapping = aes(x = value,
group=Locus, color =Locus), weight = 0.5) +
#scale_x_log10(limits=c(1e-4,0.1)) +
facet_wrap(~parameter,
scales = "free_y",
labeller = labeller(parameter= parameter_key),
dir = "h",nrow=2) +
theme(axis.text.x = element_text(size = rel(0.6)))
Theta_plot
Proportion of migrants relative to mutation rate \(\frac{m}{\mu}\)
# Posterior densities of scaled immigration rate M (= m/mu), one colored line
# per locus, log x axis, faceted by route. Uses the same long-format table
# (posteriors12_long) as the Theta plot; `posterior_long` was an undefined
# name inconsistent with the sibling plots.
M_plot <- posteriors12_long %>%
filter(str_detect(parameter, "M_")) %>%
ggplot() +
geom_density(mapping = aes(x = value,
group=Locus, color =Locus), weight = 0.5) +
scale_x_log10(limits=c(1e-4,1000)) +
facet_wrap(~parameter, scales = "free_y",
labeller = labeller(parameter= parameter_key),
dir = "v",nrow=2) +
theme(axis.text.x = element_text(size = rel(0.6)))
M_plot
Migrants per generation \(4N_em\)
# Posterior densities of migrants per generation (4*Ne*m), one colored line
# per locus, log x axis, faceted by route. Uses posteriors12_long for
# consistency with the Theta plot; `posterior_long` was an undefined name.
Nm_plot <- posteriors12_long %>%
filter(str_detect(parameter, "Nm_")) %>%
ggplot() +
geom_density(mapping = aes(x = value, group=Locus, color
=Locus), weight = 0.5) +
scale_x_log10(limits=c(1e-6,10)) +
facet_wrap(~parameter, scales = "free_y",
labeller = labeller(parameter= parameter_key),
dir = "v",nrow=2) +
theme(axis.text.x = element_text(size = rel(0.6)))
Nm_plot
Here are the priors I’m using, as a reminder.
bayes-priors= THETA WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-priors= MIG WEXPPRIOR: 0.000100 1000.000000 10000 100
bayes-priors= SPLIT * * WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-priors= SPLITSTD * * WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
Here are the basics for doing it over two loci:
# Demo of Peter's multilocus posterior summation for two loci: estimate each
# locus's posterior as a kernel density, divide out the prior, multiply the
# per-locus likelihood surfaces in log space, multiply the prior back in,
# and renormalize.
# NOTE(review): `posterior` is not defined in this document -- presumably one
# replicate's data frame (e.g. posteriors[[1]]); confirm before running.
# make density objects for two loci (n must be a power of 2, using 2^14)
testdens1 <- density(posterior$Theta_1[which(posterior$Locus==1)], n=16384,
from = 0, to = 0.1, bw = "SJ")
testdens2 <- density(posterior$Theta_1[which(posterior$Locus==2)], n=16384,
from = 0, to = 0.1, bw = "SJ")
#create the prior
# NOTE(review): the THETA prior's mean is 0.001, so an exponential with that
# mean would have rate 1/0.001 = 1000; rate = 0.001 makes the prior nearly
# flat over [0, 0.1] -- confirm the intended parameterization.
thetaprior <- dexp(testdens1$x, rate = 0.001, log = F)
#create an empty tibble
densum <- tibble(.rows=16384)
densum$x <- testdens1$x
#remove the prior and standardize
densum$l1 <- (testdens1$y/ thetaprior) / sum(testdens1$y/ thetaprior)
densum$l2 <- (testdens2$y/ thetaprior) / sum(testdens2$y/ thetaprior)
#sum over loci and standardize again
# the 1e-20 offsets guard against log(0) in cells with zero density
densum$sum <- exp(log(densum$l1+1e-20) + log(densum$l2 + 1e-20) +
log(thetaprior)) / sum( exp(log(densum$l1+1e-20) +
log(densum$l2 + 1e-20) +
log(thetaprior)))
#pivot to long format
densum_long <- pivot_longer(densum,cols=2:4,names_to = "yplot", values_to = "y")
#plot
ggplot(densum_long, aes(x = x, y = y, group = yplot, color = yplot)) +
geom_line() + scale_x_log10()
#bayes-priors= THETA WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
#bayes-priors= MIG WEXPPRIOR: 0.000100 1000.000000 10000 100
These functions are for reconstructing distributions from the bayesfile,
which consists of per-locus distributions
stored as histograms.
# Column names for the migrate-n "bayesfile" histogram table
bayesfilenames <- c("Locus","pnumber","HPC50","HPC95","value","count",
"freq","cumFreq","priorFreq")
DirB <- "/Users/eric/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Research/Ptuberculosa/migrate/run4.6"
# Read the per-locus, per-parameter histogram bins from the bayesfile
bf <- read_table(file = file.path(DirB,paste0("rep",1),winningModel,
"bayesfile"),
col_types = "ifiiddddd", col_names = bayesfilenames,
comment = "#", skip_empty_rows = T)
# Re-read the same file's header region (lines 10-33) to map parameter
# numbers (pnumber) to parameter names, e.g. "Theta_1", "M_2_1".
# NOTE(review): skip=9 / n_max=24 are tied to this migrate-n output layout —
# confirm if the run configuration changes.
parameter_number_table <- read_table(file = file.path(DirB,paste0("rep",1),winningModel,
"bayesfile"),
col_types = "cic",
col_names = c("drop","pnumber","parameter"),
skip=9, n_max = 24) %>%
select(-drop) %>%
mutate(parameter = str_replace_all(parameter, ",","_")) %>%
mutate(parameter = str_remove_all(parameter,"\\(")) %>%
mutate(parameter = str_remove_all(parameter,"\\)")) %>%
left_join(tibble(parameter_name =
parameter_key,parameter=names(parameter_key)),
by = "parameter")
# Named vector: names are pnumbers, values are human-readable parameter names
parameter_number_key <- parameter_number_table$parameter_name
names(parameter_number_key) <- parameter_number_table$pnumber
# Earlier exploratory per-locus plots, kept for reference:
#bf %>% ggplot(aes(x=value, y = freq, color = Locus)) +
# geom_line() + scale_color_continuous() +
# facet_wrap(vars(Parameter), scales = "free")
# bf %>% filter(pnumber %in% 1:9) %>% filter(Locus %in% 1:109) %>%
# ggplot(aes(x=value, y = freq, color = Locus)) + geom_line() +
# facet_wrap(vars(pnumber), scales = "free")
# Plot the first nine parameters for Locus 110 (presumably the over-loci
# summary row — TODO confirm), with the prior overlaid as a dotted grey line.
bf %>% filter(pnumber %in% 1:9) %>% filter(Locus == 110) %>%
ggplot(aes(x = value, y = freq, color = pnumber)) + geom_line() +
scale_color_brewer(palette = "YlGnBu", labels = parameter_number_key) +
ylim(0,0.5) + xlim(0,0.01) + geom_line(aes(x = value, y = priorFreq),
color = "grey", linetype="dotted")
## Warning: Removed 16200 rows containing missing values (`geom_line()`).
## Removed 16200 rows containing missing values (`geom_line()`).
These functions are for reconstructing distributions from the “raw”
posterior output, which Peter calls a bayesallfile.
remove_prior <- function(densityd, prior, threshold = 1e-10){
  # Divide a density by its prior and renormalize so the result sums to 1.
  # densityd:  numeric vector of density values on a common x grid
  # prior:     numeric vector of prior density values on the same grid
  # threshold: density values below this are zeroed before the division
  # The prior is floored at 1e-30 so the division cannot blow up.
  prior <- pmax(prior, 1e-30)
  densityd[which(densityd < threshold)] <- 0
  unscaled <- densityd / prior
  unscaled / sum(unscaled)
}
sum_over_loci <- function(df,parameter){
#this function takes a data frame of probability densities for many loci
# that have had the prior removed,
# together with with a logged prior named "logPrior",
# as well as the name of a parameter (e.g. "Theta_1")
# and sums the densities over loci.
# Columns for the parameter are expected to be named "<parameter>-<locus>"
# so that starts_with(parameter) matches every per-locus column.
# Returns df with the per-locus columns logged and a new column named after
# `parameter` holding the standardized summed-over-loci density.
# Rmpfr package allows quadruple precision for calcs on very small numbers.
require(Rmpfr)
#add a teeny-tiny amount to all values to avoid zeros
df2 <- df %>% mutate(across(starts_with(parameter),
.fns= function(x) x + 1e-11)) %>%
#log all values
mutate(across(starts_with(c(parameter)),
.fns=log)) %>%
# convert the df to rowwise so that rows can be summed
# and then sum across the row, including the prior
# (summing logs = multiplying densities; logPrior is added back exactly once)
rowwise() %>%
mutate(sum_param_prior =
sum(c_across(starts_with(c(parameter,"logPrior"))))) %>%
#convert back to a regular df
ungroup()
#need to convert to quadruple precision because
#these will exponentiate to very small numbers.
sum_param_prior_exp <- exp(mpfr(df2$sum_param_prior, precBits = 128))
# standardize by dividing by the sum of the column
sum_param_prior_standardized <-
sum_param_prior_exp/sum(sum_param_prior_exp)
#drop the intermediate columns (no longer needed), change the standardized
# output back to double precision so that it can be incorporated into the df
# rename the summed column after the parameter
df3 <- df2 %>% select(-c(sum_param_prior)) %>%
mutate(summed_over_loci =
as.numeric(sum_param_prior_standardized)) %>%
rename_with(.fn = ~ paste(parameter),
.cols = summed_over_loci)
return(df3)
}
summarize_posterior <- function(posterior,
                                parameter_type = c("Theta","M","Nm","D"),
                                prior_params = c(min,mean,max),
                                n=16384){
  # this function takes a Migrate-n posterior "bayesallfile" as a dataframe
  # as well as one of the parameter types, and the prior on that parameter
  # as a vector c(min,mean,max). Currently only exponential priors supported
  # it will create densities for each parameter of the given type,
  # remove the prior from each, sum across loci, and re-add the prior (once)
  # Returns a tibble with the x grid, prior, logPrior, per-locus logged
  # densities, and one standardized summed-over-loci column per parameter.
  # Anchored pattern so e.g. parameter_type = "M" can never match a column
  # that merely contains an "M" elsewhere in its name.
  parameters <- names(posterior) %>%
    str_subset(paste0("^", parameter_type, "_"))
  # create a tibble with the x values for all density plots
  # for a particular parameter type.
  # all_of(): explicit about selecting via an external character vector
  # (bare external vectors are deprecated in tidyselect).
  getx <- posterior %>% filter(Locus == 1) %>%
    select(all_of(parameters[1]))
  # create a tibble with x values for a density plot
  # of the chosen number of points.
  # getx[[1]] (not getx[ ,1]) so a tibble input still yields a numeric
  # vector — density() requires a vector, and tibbles never drop dimensions.
  dens <- tibble(.rows = n, x = density(getx[[1]], n=n,
                                        from = prior_params[1],
                                        to = prior_params[3],
                                        bw = "nrd0")$x)
  print("calculating densities")
  # calculate densities for each parameter of a given type at each locus:
  # pivot so each (parameter, locus) pair becomes one column, then replace
  # each column by its kernel density evaluated on the common grid
  dens <- posterior %>%
    select(starts_with(c("Steps","Locus","rep",
                         paste0(parameter_type,"_")))) %>%
    pivot_wider(names_from = "Locus", values_from =
                  starts_with(paste0(parameter_type,"_")),
                names_sep = "-") %>%
    select(starts_with(paste0(parameter_type,"_"))) %>%
    map_dfc(function(x) density(x, n = n, from =
                                  prior_params[1],
                                to = prior_params[3],
                                bw = "nrd0")$y) %>%
    bind_cols(dens)
  # create, standardize, log and remove prior
  # (exponential prior with mean prior_params[2])
  dens$prior <- dexp(dens$x, rate = 1/prior_params[2],
                     log = F)
  dens$prior <- dens$prior/sum(dens$prior)
  dens$logPrior <- log(dens$prior)
  print("removing prior")
  dens2 <- dens %>%
    #remove the prior, standardize
    mutate(across(starts_with(parameter_type),
                  ~ remove_prior(densityd = .x,
                                 prior = dens$prior,
                                 threshold = 1e-10) ))
  # sum each parameter's densities over loci, re-adding the prior once
  dens3 <- dens2
  for(p in parameters){
    print(p)
    dens3 <- sum_over_loci(df = dens3, parameter = p)
  }
  return(dens3)
}
posterior_stats <- function(df,parameter){
  # Summary statistics for one summed-over-loci posterior density.
  # df:        data frame with an "x" column (grid) and a column named after
  #            `parameter` holding the standardized density at each x
  # parameter: column name as a string, e.g. "Theta_1"
  # Returns a named vector: 2.5/25/50/75/97.5% quantiles, mode, mean, sd.
  require(spatstat.explore)  # quantile.density(), weighted.var()
  # Build a minimal object of class "density" so quantile.density() accepts it.
  # all_of(): `parameter` is an external string; the original bare
  # select(x, parameter) relied on deprecated, ambiguous tidyselect matching.
  # (The original also passed bw = 6.33e-05 to as.list(), where it was a
  # meaningless extra argument — the bw field is set explicitly below.)
  p <- df %>% select(x, all_of(parameter)) %>% as.list()
  names(p) <- c("x", "y")
  # NOTE(review): bandwidth is hard-coded; it only labels the density object
  # and does not affect the statistics computed below.
  p$bw <- 6.33e-5
  attr(p, "class") <- "density"
  qu <- quantile.density(p, c(0.025, 0.25, 0.5, 0.75, 0.975))
  # mode = x at the maximum density; which.max() returns a single index even
  # when the maximum is tied, so the result is always a scalar
  wmo <- p$x[which.max(p$y)]
  wme <- weighted.mean(p$x, p$y)
  wsd <- sqrt(weighted.var(p$x, p$y))
  stats <- c(qu, mode = wmo, mean = wme, sd = wsd)
  return(stats)
}
These are the final estimates, summed over loci.
For the model with ongoing gene flow between Kure and Oahu
And here I run the code for the \(\theta\) estimates
parameter_type <- "Theta"
parameters <- names(posteriors12) %>%
  str_subset(parameter_type)
# parameter_key2: named vector mapping pretty labels -> column names
parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]
#summarize the posterior for theta (prior: exponential, mean 0.001, on [0, 0.1])
thetas <- summarize_posterior(posteriors12, parameter_type = parameter_type,
                              prior_params = c(0,0.001,0.1))
#convert to long format
thetas_long <- thetas %>% select(all_of(c("x","prior",parameters))) %>%
  pivot_longer(starts_with(c(paste0(parameter_type,"_"))),
               names_to = "Parameter",
               values_to = "Density")
# NOTE(review): brewer.pal() is from RColorBrewer — confirm it is loaded
# earlier in the document (tidyverse does not attach it).
theta.colors <- rev(brewer.pal(9,"Spectral"))
theta.colors.light <- lighten(theta.colors)
theta.colors.dark <- darken(theta.colors)
#plot summed-over-loci densities with the prior as a dotted grey line
thetas_density <- ggplot(thetas_long, aes(x = x, y = Density,
                                          color = Parameter)) +
  geom_line(linewidth = 1) +  # linewidth: `size` for lines is deprecated in ggplot2 >= 3.4
  #scale_x_log10(limits = c(5e-4,1e-2)) +
  scale_color_discrete(type = theta.colors,
                       labels = parameter_key) +
  geom_line(aes(x = x, y=prior), color = "grey",
            linetype="dotted") +
  ylim(0,0.175) + xlim(0,0.01)
thetas_ridges <- ggplot(thetas_long, aes(x = x, y = Parameter, height = Density,
                                         fill = Parameter)) +
  geom_density_ridges(stat = "identity") +
  scale_fill_discrete(type = theta.colors, labels = parameter_key) +
  scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
  labs(x = TeX("$\\Theta$")) + xlim(0,0.01)
thetas_violins <- ggplot(thetas_long, aes(x = Parameter, y = x,
                                          violinwidth = Density*10,
                                          fill = Parameter)) +
  geom_violin(stat = "identity") +
  scale_fill_discrete(type = theta.colors, labels = parameter_key) +
  scale_x_discrete(limits = rev, labels = NULL, breaks = NULL) +
  labs(y = TeX("$\\Theta$")) + ylim(0,0.01)
# get stats for each summed-over-loci posterior
theta_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = thetas,
                                                                 parameter = .x),
                                          .id = "parameter")
Theta estimates:
# Display the saved theta summary table for the KO.D model
thetastats <- read_csv("./tables/stepping.stone.KO.D/theta_stats.csv")
## Rows: 9 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
thetastats
## # A tibble: 9 × 9
## parameter `2.5%` `25%` `50%` `75%` `97.5%` mode mean sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 theta Kure 0.00543 0.00559 0.00568 0.00575 0.00596 0.00569 0.00567 1.61e-4
## 2 theta P&H 0.00112 0.00123 0.00129 0.00136 0.00148 0.00128 0.00130 9.25e-5
## 3 theta Maro 0.00674 0.00698 0.00699 0.00700 0.00701 0.00699 0.00698 5.73e-5
## 4 theta FFS 0.00621 0.00678 0.00680 0.00682 0.00687 0.00680 0.00678 1.37e-4
## 5 theta Kauai 0.00797 0.00801 0.00803 0.00804 0.00807 0.00803 0.00802 2.73e-5
## 6 theta Oahu 0.00762 0.00765 0.00767 0.00768 0.00772 0.00767 0.00767 2.62e-5
## 7 theta Molokai 0.00678 0.00684 0.00686 0.00689 0.00775 0.00685 0.00693 2.44e-4
## 8 theta Maui 0.00771 0.00779 0.00783 0.00785 0.00789 0.00784 0.00781 4.82e-5
## 9 theta Big Is. 0.00632 0.00638 0.00640 0.00643 0.00649 0.00640 0.00639 1.10e-4
parameter_type <- "M"
parameters <- names(posteriors12) %>%
  str_subset(parameter_type)
parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]
#summarize the posterior for M (m/mu; prior: exponential, mean 1, on [0, 1000])
Ms <- summarize_posterior(posteriors12, parameter_type = parameter_type,
                          prior_params = c(0,1,1000), n = 2^14)
#convert to long format
# BUG FIX: the original pipeline ended in a dangling `%>%` followed by a
# commented-out filter, so the next assignment (migration.colors) became the
# right-hand side of the pipe and the chunk failed to evaluate. The pipe now
# ends at pivot_longer(); re-enable the filter below to drop M_6_1 if desired.
Ms_long <- Ms %>% select(all_of(c("x","prior",parameters))) %>%
  pivot_longer(starts_with(c(paste0(parameter_type,"_"))),
               names_to = "Parameter",
               values_to = "Density")
# filter(Parameter != "M_6_1")
migration.colors <- c(rbind(theta.colors, theta.colors.dark))
#plot
Ms_density <- ggplot(Ms_long, aes(x = x, y = Density,
                                  color = Parameter)) +
  geom_line(linewidth = 1) +  # linewidth: `size` for lines is deprecated in ggplot2 >= 3.4
  scale_x_log10(limits = c(1e-2,100)) +
  scale_color_discrete(type = migration.colors, labels = parameter_key) +
  geom_line(aes(x = x, y=prior), color = "grey",
            linetype="dotted")
#plot
M_ridges <- ggplot(Ms_long, aes(x = x, y = Parameter, height = Density,
                                fill = Parameter)) +
  geom_density_ridges(stat = "identity") +
  scale_x_log10(limits = c(1e-2,100)) +
  scale_fill_discrete(type = migration.colors, labels = parameter_key) +
  scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
  labs(x = TeX("$\\frac{m}{\\mu}$"))
M_violins <- ggplot(Ms_long, aes(x = Parameter, y = x, violinwidth = Density,
                                 fill = Parameter)) +
  geom_violin(stat = "identity") +
  scale_y_log10(limits = c(1e-2,100)) +
  scale_fill_discrete(type = migration.colors, labels = parameter_key)
# get stats for each summed-over-loci posterior
Ms_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = Ms,
                                                              parameter = .x),
                                       .id = "parameter")
M/mu estimates:
# Display the saved m/mu summary table for the KO.D model
mmustats <- read_csv("./tables/stepping.stone.KO.D/Mmu_stats.csv")
## Rows: 15 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
mmustats
## # A tibble: 15 × 9
## parameter `2.5%` `25%` `50%` `75%` `97.5%` mode mean sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 m/mu Oahu to Kure 66.3 66.7 67.3 68.3 71.6 66.3 67.7 1.46
## 2 m/mu Maro to P&H 1.16 1.16 1.16 1.28 1.40 1.16 1.21 0.143
## 3 m/mu P&H to Maro 1.10 1.16 1.22 1.22 1.28 1.16 1.20 0.0680
## 4 m/mu FFS to Maro 0.244 0.244 0.244 0.244 0.305 0.244 0.249 0.0435
## 5 m/mu Maro to FFS 1.10 1.22 1.28 1.34 1.46 1.28 1.28 0.117
## 6 m/mu Kauai to FFS 0.549 0.671 0.732 0.794 0.916 0.671 0.724 0.104
## 7 m/mu FFS to Kauai 0.122 0.183 0.183 0.183 0.244 0.183 0.174 0.0461
## 8 m/mu Oahu to Kauai 0.183 0.183 0.244 0.488 0.610 0.183 0.308 0.183
## 9 m/mu Kauai to Oahu 0.305 0.427 0.488 0.488 0.610 0.488 0.465 0.0897
## 10 m/mu Molokai to Oahu 0.183 0.244 0.244 0.244 0.305 0.244 0.236 0.0499
## 11 m/mu Oahu to Molokai 0.366 0.427 0.427 0.488 0.549 0.427 0.454 0.0632
## 12 m/mu Maui to Molokai 1.04 1.22 1.28 1.40 1.65 1.28 1.30 0.149
## 13 m/mu Molokai to Maui 0.183 0.183 0.244 0.244 0.244 0.244 0.217 0.0439
## 14 m/mu Big Is. to Maui 0.732 0.916 1.10 1.16 1.22 1.16 1.04 0.173
## 15 m/mu Maui to Big Is. 1.95 1.95 1.95 2.01 2.01 1.95 1.98 0.0454
parameter_type <- "Nm"
parameters <- names(posteriors12) %>%
  str_subset(parameter_type)
parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]
#summarize the posterior for Nm (4Nem; prior: exponential, mean 1, on [0, 1])
nms <- summarize_posterior(posteriors12, parameter_type = parameter_type,
                           prior_params = c(0,1,1), n = 2^17)
#convert to long format; Nm_6_1 is orders of magnitude larger and is
#summarized separately below
nms_long <- nms %>% select(all_of(c("x","prior",parameters))) %>%
  pivot_longer(starts_with(c(paste0(parameter_type,"_"))),
               names_to = "Parameter",
               values_to = "Density") %>%
  filter(Parameter != "Nm_6_1")
migration.colors <- c(rbind(theta.colors, theta.colors.dark))
#plot
Nms_density <- ggplot(nms_long, aes(x = x, y = Density,
                                    color = Parameter)) +
  geom_line(linewidth = 1) +  # linewidth: `size` for lines is deprecated in ggplot2 >= 3.4
  #scale_x_log10(limits = c(5e-4,1e-2)) +
  scale_color_discrete(type = migration.colors, labels = parameter_key) +
  geom_line(aes(x = x, y=prior), color = "grey",
            linetype="dotted") +
  ylim(0,0.125) + xlim(0,0.003)
#plot
nm_ridges <- ggplot(nms_long, aes(x = x, y = Parameter, height = Density,
                                  fill = Parameter)) +
  geom_density_ridges(stat = "identity") +
  scale_fill_discrete(type = migration.colors, labels = parameter_key) +
  scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
  labs(x = TeX("$4N_em$")) +
  xlim(0,0.003)
nm_violins <- ggplot(nms_long, aes(x = Parameter, y = x, violinwidth = Density*20,
                                   fill = Parameter)) +
  geom_violin(stat = "identity") +
  scale_fill_discrete(type = migration.colors, labels = parameter_key) +
  ylim(0,0.003)
# get stats for each summed-over-loci posterior
nms_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = nms,
                                                               parameter = .x),
                                        .id = "parameter")
\(4N_em\) estimates:
# Display the saved 4Nem summary table for the KO.D model
nmstats <- read_csv("./tables/stepping.stone.KO.D/4Nm_stats.csv")
## Rows: 15 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
nmstats
## # A tibble: 15 × 9
## parameter `2.5%` `25%` `50%` `75%` `97.5%` mode mean sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4Nem Oahu to… 9.09e-1 9.66e-1 9.83e-1 9.93e-1 9.99e-1 1 e+0 9.75e-1 2.46e-2
## 2 4Nem Maro to… 3.66e-4 4.27e-4 4.58e-4 4.96e-4 5.72e-4 4.50e-4 4.64e-4 5.47e-5
## 3 4Nem P&H to … 6.03e-4 6.71e-4 7.17e-4 7.63e-4 8.54e-4 7.10e-4 7.19e-4 6.61e-5
## 4 4Nem FFS to … 1.98e-4 2.21e-4 2.37e-4 2.59e-4 2.98e-4 2.37e-4 2.41e-4 2.66e-5
## 5 4Nem Maro to… 1.35e-3 1.52e-3 1.62e-3 1.72e-3 1.94e-3 1.61e-3 1.63e-3 1.51e-4
## 6 4Nem Kauai t… 6.49e-4 7.40e-4 7.93e-4 8.47e-4 9.69e-4 7.86e-4 7.97e-4 8.23e-5
## 7 4Nem FFS to … 8.70e-4 1.04e-3 1.14e-3 1.27e-3 1.54e-3 1.12e-3 1.16e-3 1.73e-4
## 8 4Nem Oahu to… 1.49e-3 1.74e-3 1.87e-3 2.01e-3 2.27e-3 1.86e-3 1.87e-3 2.01e-4
## 9 4Nem Kauai t… 1.08e-3 1.23e-3 1.31e-3 1.40e-3 1.57e-3 1.30e-3 1.31e-3 1.26e-4
## 10 4Nem Molokai… 9.99e-4 1.17e-3 1.26e-3 1.36e-3 1.56e-3 1.24e-3 1.26e-3 1.43e-4
## 11 4Nem Oahu to… 7.10e-4 8.24e-4 8.93e-4 9.61e-4 1.11e-3 8.85e-4 8.97e-4 1.03e-4
## 12 4Nem Maui to… 1.46e-3 1.65e-3 1.75e-3 1.86e-3 2.08e-3 1.75e-3 1.76e-3 1.61e-4
## 13 4Nem Molokai… 4.35e-4 5.04e-4 5.42e-4 5.80e-4 6.56e-4 5.34e-4 5.40e-4 5.67e-5
## 14 4Nem Big Is.… 1.26e-3 1.40e-3 1.47e-3 1.56e-3 1.75e-3 1.46e-3 1.48e-3 1.26e-4
## 15 4Nem Maui to… 1.17e-3 1.39e-3 1.52e-3 1.66e-3 2.05e-3 1.52e-3 1.54e-3 2.22e-4
##### 4NeM Estimate for Oahu - Kure
This parameter is orders of magnitude larger than the others, and effectively can’t be summarized under the same prior limits. So this code is just rejigged to focus on the Oahu-Kure parameter
parameter_type <- "Nm"
parameters <- names(posteriors12) %>%
  str_subset(parameter_type)
parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]
#summarize the posterior for Nm again, with a grid wide enough for the much
#larger Oahu-Kure parameter (Nm_6_1)
nms61 <- summarize_posterior(posteriors12, parameter_type = parameter_type,
                             prior_params = c(1,1,1000), n = 2^14)
#convert to long format, keeping only Nm_6_1
nms_61long <- nms61 %>% select(all_of(c("x","prior",parameters))) %>%
  pivot_longer(starts_with(c(paste0(parameter_type,"_"))),
               names_to = "Parameter",
               values_to = "Density") %>%
  filter(Parameter == "Nm_6_1")
migration.colors <- c(rbind(theta.colors, theta.colors.dark))
#plot
Nms_61density <- ggplot(nms_61long, aes(x = x, y = Density,
                                        color = Parameter)) +
  geom_line(linewidth = 1) +  # linewidth: `size` for lines is deprecated in ggplot2 >= 3.4
  #scale_x_log10(limits = c(5e-4,1e-2)) +
  scale_color_discrete(type = migration.colors, labels = parameter_key) +
  geom_line(aes(x = x, y=prior), color = "grey",
            linetype="dotted") +
  ylim(0,0.125) + xlim(0,100)
#plot
nm_61ridges <- ggplot(nms_61long, aes(x = x, y = Parameter, height = Density,
                                      fill = Parameter)) +
  geom_density_ridges(stat = "identity") +
  scale_fill_discrete(type = migration.colors, labels = parameter_key) +
  scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
  labs(x = TeX("$4N_em$")) +
  xlim(0,50)
nm_61violins <- ggplot(nms_61long, aes(x = Parameter, y = x, violinwidth = Density*20,
                                       fill = Parameter)) +
  geom_violin(stat = "identity") +
  scale_fill_discrete(type = migration.colors, labels = parameter_key) +
  ylim(0,50)
# get stats for the summed-over-loci Nm_6_1 posterior
nms_61stats <-posterior_stats(df = nms61, parameter = "Nm_6_1")
# Display the saved 4Nem Oahu->Kure summary table for the KO.D model
nm61stats <- read.csv("./tables/stepping.stone.KO.D/4Nm_Oahu_Kure.csv")
nm61stats
## X x
## 1 2.5% 34.6597693
## 2 25% 34.7207471
## 3 50% 34.7207471
## 4 75% 34.7207471
## 5 97.5% 38.1355063
## 6 mode 34.7207471
## 7 mean 34.7966844
## 8 sd 0.8936556
This is the divergence
# Summed-over-loci posterior for the divergence parameter D_6_1.
# NOTE(review): `parameter_type` still holds "Nm" from the previous chunk and
# `getx` is left over from an earlier summarize_posterior() call — this chunk
# appears intended for "D"; confirm before running.
# NOTE(review): the x grid is built with to = 0.0001 while the densities are
# evaluated with to = 0.001 — the two ranges disagree; confirm which is meant.
dens <- tibble(.rows = 2^15, x = density(getx[,1], n=2^15,
                                         from = 0,
                                         to = 0.0001,
                                         bw = "nrd0")$x)
dens2 <- posteriors12 %>%
  select(starts_with(c("Steps","Locus","rep",
                       paste0(parameter_type,"_")))) %>%
  pivot_wider(names_from = "Locus", values_from =
                starts_with(paste0(parameter_type,"_")),
              names_sep = "-") %>%
  select(4:112) %>%  # columns 4-112 assumed to be the 109 per-locus columns — TODO confirm
  map_dfc(function(x) density(x, n = 2^15, from = 0, to = 0.001, bw = "nrd0")$y)
dens2<-bind_cols(dens,dens2)
# BUG FIX: the original assigned prior/logPrior into `dens3`, which did not
# exist yet (the densities live in `dens2`), and later passed `dens$prior`
# (a column `dens` never has) to remove_prior(). Carry dens2 forward as dens3
# and reference dens3$prior consistently.
dens3 <- dens2
# exponential prior with mean 0.001 (the SPLIT prior), standardized
dens3$prior <- dexp(dens3$x, rate = 1/0.001,log = F)
dens3$prior <- dens3$prior/sum(dens3$prior)
dens3$logPrior <- log(dens3$prior)
dens4 <- dens3 %>%
  #remove the prior, standardize
  mutate(across(starts_with(parameter_type),~ remove_prior(densityd = .x,
                                                           prior = dens3$prior,threshold = 1e-10) ))
# rename the per-locus columns to the D_6_1-<locus> form sum_over_loci() expects
names(dens4)[1:109] <- paste0("D_6_1-",names(dens4)[1:109])
dens5 <- sum_over_loci(df = dens4, parameter = "D_6_1")
#convert to long format
# BUG FIX: the original selected `parameters` (still the Nm_* names from the
# previous chunk), which are not columns of dens5, so all_of() errored; and it
# pivoted on the "Nm_" prefix. Only the summed divergence column D_6_1 is
# wanted here, as the filter() on the original pipeline shows.
D_61long <- dens5 %>% select(all_of(c("x","prior","D_6_1"))) %>%
  pivot_longer(starts_with("D_"),
               names_to = "Parameter",
               values_to = "Density") %>%
  filter(Parameter == "D_6_1")
migration.colors <- c(rbind(theta.colors, theta.colors.dark))
#plot the summed-over-loci divergence density with the prior dotted in grey
D_61density <- ggplot(D_61long, aes(x = x, y = Density,
                                    color = Parameter)) +
  geom_line(linewidth = 1) +  # linewidth: `size` for lines is deprecated in ggplot2 >= 3.4
  #scale_x_log10(limits = c(5e-5,1e-2)) +
  scale_color_discrete(type = migration.colors, labels = parameter_key) +
  geom_line(aes(x = x, y=prior), color = "grey",
            linetype="dotted") +
  ylim(0,0.02) + xlim(0,0.001)
# get stats for the summed-over-loci posterior
D_61stats <-posterior_stats(df = dens5, parameter = "D_6_1")
# Display the saved D_6_1 (divergence) summary table for the KO.D model
D61stats <- read.csv("./tables/stepping.stone.KO.D/D_6_1_stats.csv")
D61stats
## X x
## 1 2.5% 5.249184e-04
## 2 25% 5.722221e-04
## 3 50% 5.996887e-04
## 4 75% 6.271554e-04
## 5 97.5% 6.851405e-04
## 6 mode 5.951109e-04
## 7 mean 6.006982e-04
## 8 sd 4.126898e-05
Divergence time in coalescent units (working from page 25 of the Migrate-n manual; the original citation here was incomplete)
# tau = D_6_1 mode divided by the sum of the two thetas (Oahu + Kure, from the
# tables above). NOTE(review): all three values are hard-coded from earlier
# output — update them if the run is repeated.
tau <- 5.951109e-04 / (0.00767+0.00569)
tau
## [1] 0.04454423
# divergence time in generations, assuming a mutation rate of 1.2e-8 per
# site per generation — TODO confirm the rate and its units
t <- tau / 1.2e-8
This is the same model, without ongoing gene flow between Kure and Oahu
#
Dir <- "/Users/eric/Library/CloudStorage/OneDrive-ThePennsylvaniaStateUniversity/Research/Ptuberculosa/migrate/run4.7"
winningModel <- "stepping.stone.KOd"
posteriors <- list()
# read in the posterior for all five replicates -- this may take several minutes
for(r in 1:5){
print(paste("Loading Rep",r))
posteriors[[r]] <- read.table(file.path(Dir,paste0("rep",r),winningModel,
"bayesallfile"), header=T)
}
# keep only the sampled-parameter columns from each replicate
posteriors <- lapply(posteriors, select, starts_with(c("Locus","Steps","Theta_", "M_",
"Nm_","D_","d_")))
# combine all replicates into one data frame (tagged by replicate number) and
# add migrants-per-generation columns.
# NOTE(review): despite its name, posteriors12 here contains all 5 replicates.
posteriors12 <- posteriors %>% bind_rows(.id = "rep") %>%
migrants.per.gen() %>% mutate(rep = as.integer(rep))
# long-format conversion, kept for reference:
#posteriors12_long <- posteriors12 %>% pivot_longer(starts_with(c("Theta_", "M_",
# "Nm_","D_","d_")),
# names_to = "parameter",
# values_to = "value")
# read in a population key to translate numbers into names in figures
popkey <- read_csv("popkey.csv")
names(popkey$Pop) <- popkey$Index
#make a new labeling key: "Theta_1" -> "theta Kure", "M_2_1" -> "m/mu ...", etc.
# NOTE(review): this reads names(posteriors15), which is not defined in this
# section (the object built above is posteriors12) — confirm posteriors15 is
# created earlier in the document, or whether posteriors12 was intended.
posterior_parameters <- names(posteriors15) %>% str_subset("Theta|M_|Nm_|D_|d_")
parameter_key <- posterior_parameters %>%
str_replace("(\\d)_(\\d)", "\\1 to \\2") %>%
str_replace("M_", "m/mu " ) %>%
str_replace("Theta_", "theta ") %>%
str_replace_all(popkey$Pop) %>%
str_replace("Nm_", "4Nem ")
names(parameter_key) <- posterior_parameters
# grouped-by-replicate alternative, kept for reference:
# posteriors.grouped <- posteriors %>%
# select(all_of(c("Locus",
# "Steps",
# posterior_parameters))) %>%
# group_by(rep)
# posterior.reps <- group_split(posterior.grouped)
# posterior.key <- group_keys(posterior.grouped)
# thin by keeping only every 500th step (Steps divisible by 500)
posteriors2 <- lapply(posteriors,filter,Steps%%500==0)
# select parameters only
posteriors2 <- lapply(posteriors2, select, starts_with(c("Theta_", "M_",
"Nm_","D_","d_")))
# one coda mcmc chain per replicate, then ggmcmc diagnostic plots (4 per page)
posteriors.mcmc <- mcmc.list(lapply(posteriors2,mcmc,thin=500))
posterior.ggs <- ggs(posteriors.mcmc, burnin = F)
ggmcmc(posterior.ggs, param_page = 4)
# If I wanted to pull out parameters by locus — but I don't; Peter calculates
# effective sizes on the whole posterior.
# NOTE(review): `posteriors15` is not defined in this section — confirm it is
# the 5-replicate bind created earlier in the document.
posteriors3 <- posteriors15 %>% select(starts_with(c("Replicate","Steps","Locus","Theta_", "M_",
                                                     "Nm_","D_","d_"))) %>% mutate(Locus_name = Locus)
# BUG FIX: the original select() call below was missing its closing
# parenthesis, which made this chunk unparseable. (This first assignment is
# immediately overwritten by the pivot_wider() version — kept for reference.)
posteriors3_Nm <- posteriors3 %>% select(starts_with(c("Replicate","Steps","Locus",
                                                       "Nm_")))
# one column per (parameter, locus) pair, e.g. "Nm_2_1-L3"
posteriors3_Nm <- posteriors3 %>% pivot_wider(id_cols = c("Steps", "Replicate","Locus"),
                                              names_from = "Locus_name",
                                              values_from =starts_with(c("Theta_", "M_",
                                                                         "Nm_","D_","d_")), names_sep = "-L")
posteriors3_Nm.mcmc <- mcmc(posteriors3_Nm)
effectiveSize(posteriors3_Nm.mcmc)
Effective sizes are good
Effective sizes for the first replicate, from effectiveSize(posteriors2[[1]]):
Theta_1 Theta_2 Theta_3 Theta_4 Theta_5 Theta_6 Theta_7 Theta_8 Theta_9 M_3_2 M_2_3 M_4_3 M_3_4
140194.20 99698.77 114330.54 65325.45 59135.13 73193.97 109083.37 70626.00 107475.10 50591.32 172911.83 90997.00 269569.09
M_5_4 M_4_5 M_6_5 M_5_6 M_7_6 M_6_7 M_8_7 M_7_8 M_9_8 M_8_9 D_6_1
327963.24 297808.14 273528.90 337943.73 270026.70 354645.35 350985.75 653891.00 653891.00 653891.00 170758.40
Gelman-Rubin diagnostic across 5 replicates is a bit high for some M_ parameters, but all below 1.2.
gelman.diag(posteriors.mcmc, autoburnin = F)
Potential scale reduction factors:
Point est. Upper C.I.
Theta_1 1.00 1.00
Theta_2 1.00 1.01
Theta_3 1.01 1.02
Theta_4 1.01 1.01
Theta_5 1.01 1.03
Theta_6 1.00 1.00
Theta_7 1.00 1.00
Theta_8 1.00 1.01
Theta_9 1.01 1.01
M_3_2 1.14 1.17
M_2_3 1.13 1.13
M_4_3 1.12 1.12
M_3_4 1.07 1.07
M_5_4 1.15 1.15
M_4_5 1.00 1.00
M_6_5 1.01 1.01
M_5_6 1.12 1.12
M_7_6 1.06 1.06
M_6_7 1.10 1.10
M_8_7 1.23 1.24
M_7_8 1.10 1.10
M_9_8 1.12 1.12
M_8_9 1.21 1.25
D_6_1 1.00 1.00
Multivariate psrf
1.01
gelman.diag(posteriors.mcmc, transform = T,autoburnin=F)
Potential scale reduction factors:
Point est. Upper C.I.
Theta_1 1.00 1.00
Theta_2 1.00 1.00
Theta_3 1.01 1.02
Theta_4 1.00 1.01
Theta_5 1.01 1.02
Theta_6 1.00 1.00
Theta_7 1.00 1.00
Theta_8 1.00 1.00
Theta_9 1.00 1.00
M_3_2 1.01 1.01
M_2_3 1.00 1.01
M_4_3 1.01 1.02
M_3_4 1.00 1.01
M_5_4 1.00 1.01
M_4_5 1.00 1.01
M_6_5 1.00 1.01
M_5_6 1.00 1.01
M_7_6 1.00 1.00
M_6_7 1.01 1.01
M_8_7 1.01 1.02
M_7_8 1.01 1.01
M_9_8 1.01 1.02
M_8_9 1.01 1.02
D_6_1 1.00 1.00
Multivariate psrf
1.02
And here I run the code for the \(\theta\) estimates
parameter_type <- "Theta"
parameters <- names(posteriors12) %>%
str_subset(parameter_type)
# parameter_key2: named vector mapping pretty labels -> column names
parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]
#summarize the posterior for theta (prior: exponential, mean 0.001, on [0, 0.1])
thetas <- summarize_posterior(posteriors12, parameter_type = parameter_type,
prior_params = c(0,0.001,0.1))
#convert to long format
thetas_long <- thetas %>% select(all_of(c("x","prior",parameters))) %>%
pivot_longer(starts_with(c(paste0(parameter_type,"_"))),
names_to = "Parameter",
values_to = "Density")
# NOTE(review): brewer.pal() is from RColorBrewer — confirm it is loaded
# earlier in the document (tidyverse does not attach it).
theta.colors <- rev(brewer.pal(9,"Spectral"))
theta.colors.light <- lighten(theta.colors)
theta.colors.dark <- darken(theta.colors)
#plot summed-over-loci densities with the prior as a dotted grey line
thetas_density <- ggplot(thetas_long, aes(x = x, y = Density,
color = Parameter)) +
geom_line(linewidth= 1) +
#scale_x_log10(limits = c(5e-4,1e-2)) +
scale_color_discrete(type = theta.colors,
labels = parameter_key) +
geom_line(aes(x = x, y=prior), color = "grey",
linetype="dotted") +
ylim(0,0.175) + xlim(0,0.01)
thetas_ridges <- ggplot(thetas_long, aes(x = x, y = Parameter, height = Density,
fill = Parameter)) +
geom_density_ridges(stat = "identity", scale = 5) +
scale_fill_discrete(type = theta.colors, labels = parameter_key) +
scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
labs(x = TeX("$\\Theta$")) + xlim(0,0.01)
thetas_violins <- ggplot(thetas_long, aes(x = Parameter, y = x,
violinwidth = Density*10,
fill = Parameter)) +
geom_violin(stat = "identity") +
scale_fill_discrete(type = theta.colors, labels = parameter_key) +
scale_x_discrete(limits = rev, labels = NULL, breaks = NULL) +
labs(y = TeX("$\\Theta$")) + ylim(0,0.01)
#ggsave(filename = "thetas_density.jpg",plot=thetas_density, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "thetas_ridges.jpg",plot=thetas_ridges, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "thetas_violins.jpg",plot=thetas_violins, path = "./figures/stepping.stone.KOd")
# get stats for each summed-over-loci posterior
theta_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = thetas,
parameter = .x),
.id = "parameter")
#write_csv(theta_stats,"./tables/stepping.stone.KOd/theta_stats.csv")
Theta estimates:
# Display the saved theta summary table for the KOd model
thetastats <- read_csv("./tables/stepping.stone.KOd/theta_stats.csv")
## Rows: 9 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
thetastats
## # A tibble: 9 × 9
## parameter `2.5%` `25%` `50%` `75%` `97.5%` mode mean sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 theta Kure 0.00393 0.00418 0.00425 0.00435 0.00520 0.00423 0.00429 2.42e-4
## 2 theta P&H 0.00132 0.00146 0.00152 0.00159 0.00173 0.00150 0.00153 1.04e-4
## 3 theta Maro 0.00667 0.00670 0.00671 0.00672 0.00675 0.00671 0.00671 2.10e-5
## 4 theta FFS 0.00778 0.00780 0.00782 0.00783 0.00786 0.00782 0.00781 6.22e-5
## 5 theta Kauai 0.00773 0.00778 0.00780 0.00783 0.00788 0.00779 0.00780 3.97e-5
## 6 theta Oahu 0.00740 0.00745 0.00747 0.00749 0.00753 0.00748 0.00747 3.33e-5
## 7 theta Molokai 0.00691 0.00697 0.00701 0.00707 0.00716 0.00699 0.00702 7.19e-5
## 8 theta Maui 0.00726 0.00731 0.00734 0.00736 0.00740 0.00734 0.00734 4.51e-5
## 9 theta Big Is. 0.00498 0.00513 0.00536 0.00545 0.00623 0.00541 0.00540 3.62e-4
parameter_type <- "M"
parameters <- names(posteriors12) %>%
  str_subset(parameter_type)
parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]
#summarize the posterior for M (m/mu; prior: exponential, mean 1, on [0, 1000])
Ms <- summarize_posterior(posteriors12, parameter_type = parameter_type,
                          prior_params = c(0,1,1000), n = 2^14)
#convert to long format
Ms_long <- Ms %>% select(all_of(c("x","prior",parameters))) %>%
  pivot_longer(starts_with(c(paste0(parameter_type,"_"))),
               names_to = "Parameter",
               values_to = "Density")
migration.colors <- c(rbind(theta.colors, theta.colors.dark))
#plot
M_density <- ggplot(Ms_long, aes(x = x, y = Density,
                                 color = Parameter)) +
  geom_line(linewidth = 1) +  # linewidth: `size` for lines is deprecated in ggplot2 >= 3.4
  scale_x_log10(limits = c(1e-2,100)) +
  scale_color_discrete(type = migration.colors, labels = parameter_key) +
  geom_line(aes(x = x, y=prior), color = "grey",
            linetype="dotted")
#plot
M_ridges <- ggplot(Ms_long, aes(x = x, y = Parameter, height = Density,
                                fill = Parameter)) +
  geom_density_ridges(stat = "identity") +
  scale_x_log10(limits = c(1e-2,100)) +
  scale_fill_discrete(type = migration.colors, labels = parameter_key) +
  scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
  labs(x = TeX("$\\frac{m}{\\mu}$"))
M_violins <- ggplot(Ms_long, aes(x = Parameter, y = x, violinwidth = Density,
                                 fill = Parameter)) +
  geom_violin(stat = "identity") +
  scale_y_log10(limits = c(1e-2,100)) +
  scale_fill_discrete(type = migration.colors, labels = parameter_key)
# get stats for each summed-over-loci posterior
Ms_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = Ms,
                                                              parameter = .x),
                                       .id = "parameter")
#ggsave(filename = "M_density.jpg",plot=M_density, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "M_ridges.jpg",plot=M_ridges, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "M_violins.jpg",plot=M_violins, path = "./figures/stepping.stone.KOd")
#write_csv(Ms_stats,"./tables/stepping.stone.KOd/Mmu_stats.csv")
#ggsave(filename = "M_density.jpg",plot=M_density, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "M_ridges.jpg",plot=M_ridges, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "M_violins.jpg",plot=M_violins, path = "./figures/stepping.stone.KOd")
#write_csv(Ms_stats,"./tables/stepping.stone.KOd/Mmu_stats.csv")
M/mu estimates:
# Display the saved m/mu summary table for the KOd model
mmustats <- read_csv("./tables/stepping.stone.KOd/Mmu_stats.csv")
## Rows: 14 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
mmustats
## # A tibble: 14 × 9
## parameter `2.5%` `25%` `50%` `75%` `97.5%` mode mean sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 m/mu Maro to P&H 1.28 1.34 1.40 1.40 1.46 1.40 1.37 0.0663
## 2 m/mu P&H to Maro 0.916 1.04 1.04 1.10 1.28 1.04 1.07 0.0989
## 3 m/mu FFS to Maro 0.244 0.244 0.244 0.244 0.305 0.244 0.249 0.0434
## 4 m/mu Maro to FFS 1.53 2.01 2.08 2.08 2.14 2.08 1.96 0.251
## 5 m/mu Kauai to FFS 0.183 0.244 0.244 0.244 0.305 0.244 0.253 0.0596
## 6 m/mu FFS to Kauai 0.183 0.244 0.244 0.244 0.305 0.244 0.244 0.0532
## 7 m/mu Oahu to Kauai 0.122 0.122 0.183 0.244 0.305 0.183 0.182 0.0634
## 8 m/mu Kauai to Oahu 0.183 0.183 0.183 0.244 0.305 0.183 0.212 0.0582
## 9 m/mu Molokai to Oahu 0.122 0.183 0.183 0.183 0.244 0.183 0.186 0.0449
## 10 m/mu Oahu to Molokai 0.305 0.366 0.366 0.366 0.427 0.366 0.364 0.0540
## 11 m/mu Maui to Molokai 1.04 1.16 1.22 1.28 1.34 1.16 1.20 0.110
## 12 m/mu Molokai to Maui 0.183 0.183 0.244 0.244 0.244 0.244 0.222 0.0435
## 13 m/mu Big Is. to Maui 1.04 1.10 1.16 1.16 1.16 1.16 1.13 0.0548
## 14 m/mu Maui to Big Is. 0.916 0.977 0.977 1.04 1.04 0.977 0.992 0.0461
# Switch the posterior-harvesting workflow to the Nm migration parameters.
parameter_type <- "Nm"
# All columns of posteriors12 whose names contain "Nm".
parameters <- names(posteriors12) %>%
str_subset(parameter_type)
# Build a labelled copy of the parameter names.
# NOTE(review): this assumes parameter_key is a named vector whose names are
# the Nm_* column names and whose values are display labels — confirm against
# where parameter_key is defined earlier in the file.
parameter_key2 <- parameters
names(parameter_key2) <- parameter_key[names(parameter_key) %in% parameter_key2]
#summarize the posterior for theta
# NOTE(review): the comment above says "theta" but parameter_type is "Nm"
# here — looks copied from the theta section; the call below summarizes Nm.
nms <- summarize_posterior(posteriors12, parameter_type = parameter_type,
prior_params = c(0,1,1), n = 2^17)
#convert to long format
# Nm_6_1 is excluded here; presumably that pair is handled as a divergence
# (D_6_1) further down — TODO confirm.
nms_long <- nms %>% select(all_of(c("x","prior",parameters))) %>%
pivot_longer(starts_with(c(paste0(parameter_type,"_"))),
names_to = "Parameter",
values_to = "Density") %>%
filter(Parameter != "Nm_6_1")
# Interleave the light and dark theta palettes so the two directions of each
# population pair get paired shades (assumes theta.colors / theta.colors.dark
# are defined earlier in the file).
migration.colors <- c(rbind(theta.colors, theta.colors.dark))
#plot
# Density plot of the summed-over-loci Nm posteriors: one colored line per
# migration parameter, with the (normalized) prior overlaid as a dotted
# grey line for reference.
Nm_density <- ggplot(nms_long, aes(x = x, y = Density, color = Parameter))
Nm_density <- Nm_density +
  geom_line(size = 1) +
  scale_color_discrete(type = migration.colors, labels = parameter_key)
# Overlay the prior and fix the plotting window.
Nm_density <- Nm_density +
  geom_line(aes(x = x, y = prior), color = "grey", linetype = "dotted") +
  xlim(0, 0.003) +
  ylim(0, 0.125)
#plot
# Ridgeline view of the same Nm posteriors: one filled ridge per parameter,
# drawn from the pre-computed densities (stat = "identity"), with the y axis
# reversed and its labels/breaks suppressed.
Nm_ridges <- ggplot(
  nms_long,
  aes(x = x, y = Parameter, height = Density, fill = Parameter)
) +
  geom_density_ridges(stat = "identity") +
  scale_fill_discrete(type = migration.colors, labels = parameter_key) +
  scale_y_discrete(limits = rev, labels = NULL, breaks = NULL) +
  xlim(0, 0.003) +
  labs(x = TeX("$4N_em$"))
# Violin view of the Nm posteriors, built from the pre-computed densities
# (stat = "identity"); Density is scaled by 20 to give the violins a
# visible width.
Nm_violins <- ggplot(
  nms_long,
  aes(x = Parameter, y = x, violinwidth = Density * 20, fill = Parameter)
) +
  geom_violin(stat = "identity") +
  scale_fill_discrete(type = migration.colors, labels = parameter_key) +
  ylim(0, 0.003)
# get stats for each summed-over-loci posterior
Nm_stats <- parameter_key2 %>% map_dfr(.f = ~ posterior_stats(df = nms,
parameter = .x),
.id = "parameter")
#ggsave(filename = "Nm_density.jpg",plot=Nm_density, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "Nm_ridges.jpg",plot=Nm_ridges, path = "./figures/stepping.stone.KOd")
#ggsave(filename = "Nm_violins.jpg",plot=Nm_violins, path = "./figures/stepping.stone.KOd")
#write_csv(Nm_stats,"./tables/stepping.stone.KOd/Nm_stats.csv")
\(4N_em\) estimates:
# Re-read the saved 4Nem summary table from disk (written by the commented-out
# write_csv above) and display it. The `##` lines below are rendered output.
nmstats <- read_csv("./tables/stepping.stone.KOd/Nm_stats.csv")
## Rows: 14 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): parameter
## dbl (8): 2.5%, 25%, 50%, 75%, 97.5%, mode, mean, sd
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nmstats
## # A tibble: 14 × 9
## parameter `2.5%` `25%` `50%` `75%` `97.5%` mode mean sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4Nem Maro to… 4.73e-4 5.57e-4 6.10e-4 6.64e-4 7.86e-4 6.03e-4 6.14e-4 8.12e-5
## 2 4Nem P&H to … 5.42e-4 6.18e-4 6.64e-4 7.02e-4 7.93e-4 6.56e-4 6.62e-4 6.50e-5
## 3 4Nem FFS to … 2.06e-4 2.37e-4 2.52e-4 2.67e-4 3.05e-4 2.44e-4 2.51e-4 2.71e-5
## 4 4Nem Maro to… 1.17e-3 1.32e-3 1.40e-3 1.50e-3 1.71e-3 1.39e-3 1.41e-3 1.39e-4
## 5 4Nem Kauai t… 5.19e-4 5.95e-4 6.33e-4 6.79e-4 7.71e-4 6.26e-4 6.36e-4 6.46e-5
## 6 4Nem FFS to … 1.21e-3 1.41e-3 1.53e-3 1.66e-3 1.95e-3 1.48e-3 1.54e-3 1.87e-4
## 7 4Nem Oahu to… 1.10e-3 1.29e-3 1.40e-3 1.50e-3 1.77e-3 1.41e-3 1.40e-3 1.71e-4
## 8 4Nem Kauai t… 9.23e-4 1.06e-3 1.14e-3 1.24e-3 1.43e-3 1.13e-3 1.15e-3 1.28e-4
## 9 4Nem Molokai… 8.54e-4 9.77e-4 1.05e-3 1.13e-3 1.30e-3 1.04e-3 1.06e-3 1.17e-4
## 10 4Nem Oahu to… 6.94e-4 7.86e-4 8.39e-4 9.00e-4 1.03e-3 8.32e-4 8.47e-4 8.83e-5
## 11 4Nem Maui to… 1.10e-3 1.27e-3 1.36e-3 1.46e-3 1.66e-3 1.36e-3 1.36e-3 1.46e-4
## 12 4Nem Molokai… 4.27e-4 4.81e-4 5.11e-4 5.42e-4 6.10e-4 5.04e-4 5.11e-4 4.87e-5
## 13 4Nem Big Is.… 1.11e-3 1.24e-3 1.31e-3 1.39e-3 1.56e-3 1.30e-3 1.32e-3 1.15e-4
## 14 4Nem Maui to… 1.25e-3 1.76e-3 2.11e-3 2.37e-3 2.80e-3 2.23e-3 2.07e-3 4.23e-4
#### Divergence Estimate Oahu-Kure
This is the posterior density of the divergence time parameter (D_6_1) between Oahu and Kure.
# Build the posterior of the Oahu–Kure divergence time (D_6_1): estimate a
# density per locus, strip the prior, then sum over loci — mirroring the Nm
# workflow above. The x-grid comes from the first column of getx.
# NOTE(review): this grid runs 0..0.0001 while the per-locus densities in
# dens2 below are evaluated on 0..0.001 — confirm the mismatch is intended
# before treating dens$x as the axis for those densities.
dens <- tibble(.rows = 2^15, x = density(getx[,1], n=2^15,
from = 0,
to = 0.0001,
bw = "nrd0")$x)
# Widen so each (parameter, locus) pair becomes its own column, then estimate
# each column's density on a common 2^15-point grid.
# NOTE(review): parameter_type is still "Nm" at this point, so the columns
# selected here are Nm_* — they are only relabelled as D_6_1 further down;
# confirm these columns really hold the divergence samples.
dens2 <- posteriors12 %>%
select(starts_with(c("Steps","Locus","rep",
paste0(parameter_type,"_")))) %>%
pivot_wider(names_from = "Locus", values_from =
starts_with(paste0(parameter_type,"_")),
names_sep = "-") %>%
select(4:112) %>% # hard-coded: the 109 per-locus columns; fragile if loci change
map_dfc(function(x) density(x, n = 2^15, from = 0, to = 0.001, bw = "nrd0")$y)
dens2<-bind_cols(dens,dens2)
# Exponential prior with mean 0.001, normalized to sum to 1 over the grid.
dens2$prior <- dexp(dens2$x, rate = 1/0.001,log = F)
dens2$prior <- dens2$prior/sum(dens2$prior)
dens2$logPrior <- log(dens2$prior)
dens4 <- dens2 %>%
#remove the prior, standardize
mutate(across(starts_with(parameter_type),~ remove_prior(densityd = .x,
prior = dens2$prior,threshold = 1e-10) ))
# Relabel the per-locus columns as divergence D_6_1 and sum over loci.
# NOTE(review): after bind_cols, column 1 of dens4 is the x grid, so
# [1:109] renames x to "D_6_1-x" and misses the last density column —
# looks like an off-by-one; confirm whether [2:110] was intended and
# whether sum_over_loci would pick up the renamed x column.
names(dens4)[1:109] <- paste0("D_6_1-",names(dens4)[1:109])
dens5 <- sum_over_loci(df = dens4, parameter = "D_6_1")
dens5 <- bind_cols(dens5, x=dens$x)
#convert to long format
D_61long <- dens5 %>%
pivot_longer(starts_with(c(paste0("D","_"))),
names_to = "Parameter",
values_to = "Density") %>%
filter(Parameter == "D_6_1")
D_61long <- bind_cols(D_61long,x=dens$x)
#migration.colors <- c(rbind(theta.colors, theta.colors.dark))
#plot
# Density plot of the summed-over-loci D_6_1 posterior; the prior overlay and
# log x scale are left commented out as alternatives.
D_61density <- ggplot(D_61long, aes(x = x, y = Density,
color = Parameter)) +
geom_line(size = 1) +
#scale_x_log10(limits = c(5e-5,1e-2)) +
scale_color_discrete(type = migration.colors, labels = parameter_key) +
#geom_line(aes(x = x, y=prior), color = "grey",
# linetype="dotted") +
ylim(0,0.0002) + xlim(0,0.0002)
# get stats for each summed-over-loci posterior
D_61stats <-posterior_stats(df = dens5, parameter = "D_6_1")
#ggsave(filename = "D_6_1.jpg",plot=D_61density, path = "./figures/stepping.stone.KOd")
#write.csv(D_61stats ,"./tables/stepping.stone.KOd/D_6_1.csv",quote=F)
# Re-read the saved stats (base read.csv here, so quantile names land in
# column "X") and display them.
D61stats <- read.csv("./tables/stepping.stone.KOd/D_6_1.csv")
D61stats
## X x
## 1 2.5% 5.306864e-05
## 2 25% 6.368908e-05
## 3 50% 6.983551e-05
## 4 75% 7.629627e-05
## 5 97.5% 8.960540e-05
## 6 mode 6.932279e-05
## 7 mean 7.021043e-05
## 8 sd 9.304214e-06
Divergence time in coalescent units, following the approach described on page 25 of Beerli et al. (2019).
# Divergence time of Oahu–Kure in coalescent (mutation-scaled) units:
# posterior mode of D_6_1 divided by the sum of the two theta estimates,
# following Beerli et al. (2019). Named constants replace the bare magic
# numbers so the provenance of each value is explicit.
D_mode <- 6.932279e-05          # posterior mode of D_6_1 (see D61stats above)
theta_sum <- 0.00748 + 0.00423  # presumably the theta estimates of the two
                                # populations involved — TODO confirm source
tau <- D_mode / theta_sum
tau
## [1] 0.005919965
# Convert to generations by dividing by the per-site, per-generation
# mutation rate (1.2e-8) — NOTE(review): confirm this rate's citation.
t <- tau / 1.2e-8